Unverified commit 8d09630a, authored by gongchensu and committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/dequantize_awq.hpp"
#include <infiniop.h>
namespace infinicore::op::dequantize_awq_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, DequantizeAWQ, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, x, x_packed, x_scale, x_zeros;
};
void *plan(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
size_t seed = hash_combine(x, x_packed, x_scale, x_zeros);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, DequantizeAWQ,
seed,
x->desc(), x_packed->desc(), x_scale->desc(), x_zeros->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, DequantizeAWQ, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x),
graph::GraphTensor(x_packed),
graph::GraphTensor(x_scale),
graph::GraphTensor(x_zeros)};
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopDequantizeAWQ(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->x->data(),
planned->x_packed->data(),
planned->x_scale->data(),
planned->x_zeros->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(DequantizeAWQ, &plan, &run, &cleanup);
} // namespace infinicore::op::dequantize_awq_impl::infiniop
#include "infinicore/ops/distributed/allreduce.hpp"
#include "../../utils.hpp"
namespace infinicore::op::distributed {
struct PlannedMeta {
graph::GraphTensor output, input;
infinicclReduceOp_t op;
infinicclComm_t communicator;
};
AllReduce::AllReduce(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
INFINICORE_ASSERT(output->is_contiguous() && input->is_contiguous());
INFINICORE_ASSERT(output->numel() == input->numel());
planned_meta_ = new PlannedMeta{graph::GraphTensor(output), graph::GraphTensor(input), op, communicator};
}
AllReduce::~AllReduce() {
if (planned_meta_) {
PlannedMeta *meta = reinterpret_cast<PlannedMeta *>(planned_meta_);
delete meta;
}
}
void AllReduce::run() const {
PlannedMeta *meta = reinterpret_cast<PlannedMeta *>(planned_meta_);
INFINICORE_CHECK_ERROR(infinicclAllReduce(meta->input->data(),
meta->output->data(),
meta->input->numel(),
static_cast<infiniDtype_t>(static_cast<int>(meta->input->dtype())),
meta->op,
meta->communicator,
infinicore::context::getStream()));
}
void AllReduce::execute(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(AllReduce, output, input, op, communicator);
}
Tensor allreduce(const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
allreduce_(output, input, op, communicator);
return output;
}
void allreduce_(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
AllReduce::execute(output, input, op, communicator);
}
} // namespace infinicore::op::distributed
#include "infinicore/ops/embedding.hpp"
#include "infinicore/context/context.hpp"
#include <cstring>
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Embedding);
Embedding::Embedding(Tensor out, const Tensor &input, const Tensor &weight) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, weight);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, input, weight);
}
void Embedding::execute(Tensor out, const Tensor &input, const Tensor &weight) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Embedding, out, input, weight);
}
Tensor embedding(const Tensor &input, // LongTensor of arbitrary shape containing the indices to extract
const Tensor &weight // Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
) {
auto input_shape = input->shape();
auto weight_shape = weight->shape();
// auto vocab_size = weight_shape[0];
auto embedding_dim = weight_shape[1];
// Assign memory to out variables
@@ -21,69 +30,8 @@
return inputs_embeds;
}
void embedding_(Tensor out, const Tensor &input, const Tensor &weight) {
Embedding::execute(out, input, weight);
}
} // namespace infinicore::op
#include "../infiniop_impl.hpp"
#include "infinicore/ops/embedding.hpp"
namespace infinicore::op::embedding_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Embedding, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor out, input, weight;
};
void *plan(Tensor out, const Tensor &input, const Tensor &weight) {
size_t seed = hash_combine(out, input, weight);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Embedding,
seed, out->desc(), input->desc(), weight->desc());
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(out),
graph::GraphTensor(input),
graph::GraphTensor(weight)};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopEmbedding(
planned->descriptor->desc,
planned->out->data(), planned->input->data(), planned->weight->data(), context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Embedding, &plan, &run, &cleanup);
} // namespace infinicore::op::embedding_impl::infiniop
#include "infinicore/ops/flash_attention.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(FlashAttention);
FlashAttention::FlashAttention(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k, v);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(),
out, q, k, v, total_kv_len, scale, is_causal);
}
void FlashAttention::execute(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(FlashAttention, out, q, k, v, total_kv_len, scale, is_causal);
}
Tensor flash_attention(const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal) {
Shape shape = q->shape();
int idx = shape.size() - 1;
shape[idx] = v->shape()[idx];
auto out = Tensor::empty(shape, q->dtype(), q->device());
flash_attention_(out, q, k, v, total_kv_len, scale, is_causal);
return out;
}
void flash_attention_(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal) {
FlashAttention::execute(out, q, k, v, total_kv_len, scale, is_causal);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/flash_attention.hpp"
#include <infiniop.h>
namespace infinicore::op::flash_attention_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, FlashAttention, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, out, q, k, v, total_kv_len;
float scale;
bool is_causal;
};
void *plan(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal) {
size_t seed = hash_combine(out, q, k, v, total_kv_len, scale, is_causal);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, FlashAttention,
seed, out->desc(), q->desc(), k->desc(), v->desc(), total_kv_len->desc(), scale, is_causal);
INFINIOP_WORKSPACE_TENSOR(workspace, FlashAttention, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(out),
graph::GraphTensor(q),
graph::GraphTensor(k),
graph::GraphTensor(v),
graph::GraphTensor(total_kv_len), scale, is_causal};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopFlashAttention(
planned->descriptor->desc, planned->workspace->data(), planned->workspace->numel(),
planned->out->data(), planned->q->data(), planned->k->data(), planned->v->data(), planned->total_kv_len->data(), context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(FlashAttention, &plan, &run, &cleanup);
} // namespace infinicore::op::flash_attention_impl::infiniop
@@ -3,19 +3,18 @@
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Gemm);
Gemm::Gemm(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b, alpha, beta);
}
void Gemm::execute(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Gemm, c, a, b, alpha, beta);
}
Tensor gemm(const Tensor &a, const Tensor &b, float alpha, float beta) {
Shape shape = a->shape();
Size size = a->ndim();
shape[size - 1] = b->size(size - 1);
@@ -24,7 +23,7 @@
return c;
}
void gemm_(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
Gemm::execute(c, a, b, alpha, beta);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/ops/gemm.hpp"
#include <infiniop.h>
namespace infinicore::op::gemm_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Gemm, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, c, a, b;
float alpha, beta;
};
void *plan(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
size_t seed = hash_combine(c, a, b);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Gemm,
seed, c->desc(), a->desc(), b->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, Gemm, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a),
graph::GraphTensor(b),
alpha, beta};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopGemm(
planned->descriptor->desc, planned->workspace->data(), planned->workspace->numel(),
planned->c->data(), planned->a->data(), planned->b->data(), planned->alpha, planned->beta, context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Gemm, &plan, &run, &cleanup);
} // namespace infinicore::op::gemm_impl::infiniop
#pragma once
#include "../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
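// INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) defines a move-only
// RAII wrapper around infiniop##__OP_NAME__##Descriptor_t (the descriptor is destroyed when
// the wrapper is) and a thread_local descriptor cache `caches` of capacity __SIZE__ holding
// shared descriptors, looked up per device via getCache().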
#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) \
struct __DESC_TYPE__ { \
infiniop##__OP_NAME__##Descriptor_t desc = nullptr; \
\
explicit __DESC_TYPE__(infiniop##__OP_NAME__##Descriptor_t d) \
: desc(d) {} \
\
/* non-copyable */ \
__DESC_TYPE__(const __DESC_TYPE__ &) = delete; \
__DESC_TYPE__ &operator=(const __DESC_TYPE__ &) = delete; \
\
/* movable */ \
__DESC_TYPE__(__DESC_TYPE__ &&other) noexcept \
: desc(other.desc) { \
other.desc = nullptr; \
} \
\
__DESC_TYPE__ &operator=(__DESC_TYPE__ &&other) noexcept { \
if (this != &other) { \
if (desc != nullptr) { \
infiniopDestroy##__OP_NAME__##Descriptor(desc); \
} \
desc = other.desc; \
other.desc = nullptr; \
} \
return *this; \
} \
\
~__DESC_TYPE__() { \
if (desc != nullptr) { \
infiniopDestroy##__OP_NAME__##Descriptor(desc); \
} \
} \
}; \
\
thread_local common::OpCache<size_t, std::shared_ptr<__DESC_TYPE__>> \
caches( \
__SIZE__, \
[](std::shared_ptr<__DESC_TYPE__> &desc) { \
desc = nullptr; \
});
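// INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE looks up the shared descriptor keyed by
// __HASH_KEY__ in the current device's cache; on a miss it creates one with
// infiniopCreate##__INFINIOP_NAME__##Descriptor(handle, &desc, __VA_ARGS__) and caches it.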
#define INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(__DESC_TYPE__, __DESC_NAME__, __INFINIOP_NAME__, __HASH_KEY__, ...) \
std::shared_ptr<__DESC_TYPE__> __DESC_NAME__; \
{ \
auto device__ = context::getDevice(); \
auto &cache__ = caches.getCache(device__); \
__DESC_NAME__ = cache__.get(__HASH_KEY__).value_or(nullptr); \
if (!__DESC_NAME__) { \
__DESC_NAME__ = std::make_shared<__DESC_TYPE__>(nullptr); \
INFINICORE_CHECK_ERROR(infiniopCreate##__INFINIOP_NAME__##Descriptor( \
context::getInfiniopHandle(device__), \
&__DESC_NAME__->desc, \
__VA_ARGS__)); \
cache__.put(__HASH_KEY__, __DESC_NAME__); \
} \
}
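// INFINIOP_WORKSPACE_TENSOR queries the operator's workspace size for __DESC_NAME__ and
// allocates a U8 tensor __TENSOR_NAME__ of that many bytes on the current device, so its
// numel() equals the workspace size in bytes.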
#define INFINIOP_WORKSPACE_TENSOR(__TENSOR_NAME__, __INFINIOP_NAME__, __DESC_NAME__) \
Tensor __TENSOR_NAME__; \
{ \
auto device__ = context::getDevice(); \
size_t workspace_size = 0; \
INFINICORE_CHECK_ERROR(infiniopGet##__INFINIOP_NAME__##WorkspaceSize(__DESC_NAME__->desc, &workspace_size)); \
__TENSOR_NAME__ = Tensor::empty({workspace_size}, DataType::U8, device__); \
}
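// A minimal sketch of how the three macros above compose inside a backend, using a
// hypothetical "Foo" operator (the real Gemm/Mul/Embedding backends in this change follow
// the same shape):
//
//   namespace infinicore::op::foo_impl::infiniop {
//   INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Foo, 100);
//   void *plan(Tensor out, const Tensor &in) {
//       size_t seed = hash_combine(out, in);
//       INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
//           Descriptor, descriptor, Foo,
//           seed, out->desc(), in->desc());
//       INFINIOP_WORKSPACE_TENSOR(workspace, Foo, descriptor);
//       // wrap descriptor + graph::GraphTensor handles in a PlannedMeta and return it
//       return /* new PlannedMeta{...} */ nullptr;
//   }
//   } // namespace infinicore::op::foo_impl::infiniop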
#include "infinicore/ops/kv_caching.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(KVCaching);
KVCaching::KVCaching(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k_cache, v_cache, k, v, past_kv_lengths);
INFINICORE_GRAPH_OP_DISPATCH(k_cache->device().getType(),
k_cache,
v_cache,
k,
v,
past_kv_lengths);
}
void KVCaching::execute(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(KVCaching,
k_cache,
v_cache,
k,
v,
past_kv_lengths);
}
void kv_caching_(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
KVCaching::execute(k_cache, v_cache, k, v, past_kv_lengths);
}
} // namespace infinicore::op
#include "../infiniop_impl.hpp"
#include "infinicore/ops/kv_caching.hpp"
namespace infinicore::op::kv_caching_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, KVCaching, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, k_cache, v_cache, k, v, past_kv_lengths;
};
void *plan(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
size_t seed = hash_combine(k_cache, v_cache, k, v, past_kv_lengths);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, KVCaching,
seed, k_cache->desc(), v_cache->desc(),
k->desc(), v->desc(), past_kv_lengths->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, KVCaching, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(k_cache),
graph::GraphTensor(v_cache),
graph::GraphTensor(k),
graph::GraphTensor(v),
graph::GraphTensor(past_kv_lengths)};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopKVCaching(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->k_cache->data(),
planned->v_cache->data(),
planned->k->data(),
planned->v->data(),
planned->past_kv_lengths->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(KVCaching, &plan, &run, &cleanup);
} // namespace infinicore::op::kv_caching_impl::infiniop
#include "infinicore/ops/linear.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/matmul.hpp"
#include "infinicore/ops/gemm.hpp"
#include "infinicore/ops/rearrange.hpp"
namespace infinicore::op {
@@ -42,16 +42,18 @@ void linear_(Tensor out,
// linear transformation
Tensor out_view = out->view({N, out_features});
float alpha = 1.0f;
float beta = 0.0f;
// Add bias by copying it into the output first, then letting the GEMM accumulate onto it (beta = 1)
if (bias.has_value()) {
rearrange_(out_view,
bias.value()->as_strided({N, out_features}, {0, 1}));
beta = 1.0f;
}
gemm_(out_view,
input->view({N, in_features}),
weight->permute({1, 0}), alpha, beta);
}
} // namespace infinicore::op
#include "infinicore/ops/linear_w4a16_awq.hpp"
#include "infinicore/ops/dequantize_awq.hpp"
#include "infinicore/ops/gemm.hpp"
namespace infinicore::op {
Tensor linear_w4a16_awq(Tensor input,
Tensor weight_packed,
Tensor weight_scale,
Tensor weight_zeros,
std::optional<Tensor> bias) {
// Input is of shape [M, K]; weight_packed is of shape [N, K] with strides [N, 1]
Size ndim = input->ndim();
Size out_features = weight_packed->shape()[0];
// Assign memory to out variables
auto output_shape = input->shape();
output_shape[ndim - 1] = out_features;
auto out = Tensor::empty(output_shape, input->dtype(), input->device());
// Compute into the preallocated output tensor
linear_w4a16_awq_(out, input, weight_packed, weight_scale, weight_zeros, bias);
return out;
}
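// W4A16 AWQ linear: dequantize the 4-bit packed weight into a temporary
// [out_features, in_features] matrix in the output dtype, then apply a regular GEMM
// (with the optional bias folded in the same way as linear_()).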
void linear_w4a16_awq_(Tensor out,
Tensor input,
Tensor weight_packed,
Tensor weight_scale,
Tensor weight_zeros,
std::optional<Tensor> bias) {
auto weight_packed_shape = weight_packed->shape();
Size out_features = weight_packed_shape[0];
Size in_features = weight_packed_shape[1];
Size ndim = input->ndim();
assert(out->ndim() == ndim);
Size N = 1;
auto input_shape = input->shape();
for (size_t i = 0; i < ndim - 1; ++i) {
N *= input_shape[i];
}
auto weight = Tensor::empty(
{out_features, in_features},
out->dtype(),
weight_packed->device());
Tensor out_view = out->view({N, out_features});
float alpha = 1.0f;
float beta = 0.0f;
op::dequantize_awq_(weight, weight_packed, weight_scale, weight_zeros);
// Add bias by copying it into the output first, then letting the GEMM accumulate onto it (beta = 1)
if (bias.has_value()) {
rearrange_(out_view,
bias.value()->as_strided({N, out_features}, {0, 1}));
beta = 1.0f;
}
gemm_(out_view,
input->view({N, in_features}),
weight->permute({1, 0}), alpha, beta);
}
} // namespace infinicore::op
#include "infinicore/ops/linear_w8a8i8.hpp"
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include "infinicore/ops/scaled_mm_i8.hpp"
namespace infinicore::op {
Tensor linear_w8a8i8(Tensor input,
Tensor weight_packed,
Tensor weight_scale,
std::optional<Tensor> bias) {
// Input is of shape [M, K]; weight_packed is of shape [N, K] with strides [N, 1]
Size ndim = input->ndim();
Size out_features = weight_packed->shape()[0];
// Assign memory to out variables
auto output_shape = input->shape();
output_shape[ndim - 1] = out_features;
auto out = Tensor::empty(output_shape, input->dtype(), input->device());
// Compute into the preallocated output tensor
linear_w8a8i8_(out, input, weight_packed, weight_scale, bias);
return out;
}
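// W8A8 int8 linear: the activations are quantized row-wise to int8 with an F32 per-row scale
// (per_channel_quant_i8_), then scaled_mm_i8_ multiplies them against the pre-quantized int8
// weight, taking both scale tensors and the optional bias.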
void linear_w8a8i8_(Tensor out,
Tensor input,
Tensor weight_packed,
Tensor weight_scale,
std::optional<Tensor> bias) {
auto weight_packed_shape = weight_packed->shape();
Size out_features = weight_packed_shape[0];
Size in_features = weight_packed_shape[1];
Size ndim = input->ndim();
assert(out->ndim() == ndim);
Size N = 1;
auto input_shape = input->shape();
for (size_t i = 0; i < ndim - 1; ++i) {
N *= input_shape[i];
}
auto input_packed = Tensor::empty(
{N, input_shape[ndim - 1]},
DataType::I8,
input->device());
auto input_scale = Tensor::empty(
{N, 1},
DataType::F32,
input->device());
op::per_channel_quant_i8_(input->view({N, in_features}), input_packed, input_scale);
if (bias.has_value()) {
bias = std::make_optional(bias.value()->as_strided({N, out_features}, {0, 1}));
}
op::scaled_mm_i8_(
out->view({N, out_features}),
input_packed,
input_scale,
weight_packed->permute({1, 0}),
weight_scale,
bias);
}
} // namespace infinicore::op
#include "infinicore/ops/mul.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Mul);
Mul::Mul(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b);
}
void Mul::execute(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Mul, c, a, b);
}
Tensor mul(const Tensor &a, const Tensor &b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
mul_(c, a, b);
return c;
}
void mul_(Tensor c, const Tensor &a, const Tensor &b) {
Mul::execute(c, a, b);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/mul.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::mul_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Mul, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, c, a, b;
};
void *plan(Tensor c, const Tensor &a, const Tensor &b) {
size_t seed = hash_combine(c, b, a);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Mul,
seed, c->desc(), a->desc(), b->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, Mul, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a),
graph::GraphTensor(b)};
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopMul(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->c->data(),
planned->a->data(),
planned->b->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Mul, &plan, &run, &cleanup);
} // namespace infinicore::op::mul_impl::infiniop
#include "infinicore/ops/paged_attention.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PagedAttention);
PagedAttention::PagedAttention(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(),
out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
}
void PagedAttention::execute(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(
PagedAttention,
out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
}
Tensor paged_attention(const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
paged_attention_(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
return out;
}
void paged_attention_(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
PagedAttention::execute(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
}
} // namespace infinicore::op
#include "infinicore/ops/paged_attention.hpp"
#include "../infiniop_impl.hpp"
namespace infinicore::op::paged_attention_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PagedAttention, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, out, q, k_cache, v_cache, block_tables, cache_lens;
std::optional<graph::GraphTensor> alibi_slopes;
float scale;
};
void *plan(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &cache_lens,
std::optional<Tensor> alibi_slopes, float scale) {
size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PagedAttention,
seed,
out->desc(), q->desc(), k_cache->desc(), v_cache->desc(),
block_tables->desc(), cache_lens->desc(),
alibi_slopes ? alibi_slopes.value()->desc() : nullptr,
scale);
INFINIOP_WORKSPACE_TENSOR(workspace, PagedAttention, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(out),
graph::GraphTensor(q),
graph::GraphTensor(k_cache),
graph::GraphTensor(v_cache),
graph::GraphTensor(block_tables),
graph::GraphTensor(cache_lens),
alibi_slopes ? std::optional<graph::GraphTensor>(graph::GraphTensor(*alibi_slopes)) : std::nullopt,
scale};
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopPagedAttention(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->out->data(),
p->q->data(),
p->k_cache->data(),
p->v_cache->data(),
p->block_tables->data(),
p->cache_lens->data(),
p->alibi_slopes.has_value() ? p->alibi_slopes.value()->data() : nullptr,
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PagedAttention, &plan, &run, &cleanup);
} // namespace infinicore::op::paged_attention_impl::infiniop
#include "infinicore/ops/paged_attention_prefill.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<PagedAttentionPrefill::schema> &PagedAttentionPrefill::dispatcher() {
static common::OpDispatcher<PagedAttentionPrefill::schema> dispatcher_;
return dispatcher_;
};
void PagedAttentionPrefill::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
std::optional<Tensor> alibi_slopes, float scale) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q);
infinicore::context::setDevice(out->device());
dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables,
kv_lens, cum_seqlens_q, alibi_slopes, scale);
}
Tensor paged_attention_prefill(Tensor q, Tensor k_cache, Tensor v_cache,
Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
std::optional<Tensor> alibi_slopes, float scale) {
auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
return out;
}
void paged_attention_prefill_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
std::optional<Tensor> alibi_slopes, float scale) {
PagedAttentionPrefill::execute(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/paged_attention_prefill.hpp"
#include <infiniop.h>
namespace infinicore::op::paged_attention_prefill_impl::infiniop {
thread_local common::OpCache<size_t, infiniopPagedAttentionPrefillDescriptor_t> caches(
100, // capacity
[](infiniopPagedAttentionPrefillDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyPagedAttentionPrefillDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
std::optional<Tensor> alibi_slopes, float scale) {
size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopPagedAttentionPrefillDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreatePagedAttentionPrefillDescriptor(
context::getInfiniopHandle(device), &desc,
out->desc(),
q->desc(),
k_cache->desc(),
v_cache->desc(),
block_tables->desc(),
kv_lens->desc(),
cum_seqlens_q->desc(),
alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr,
scale));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetPagedAttentionPrefillWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopPagedAttentionPrefill(
desc,
workspace->data(),
workspace_size,
out->data(),
q->data(),
k_cache->data(),
v_cache->data(),
block_tables->data(),
kv_lens->data(),
cum_seqlens_q->data(),
alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr,
context::getStream()));
}
static bool registered = []() {
PagedAttentionPrefill::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::paged_attention_prefill_impl::infiniop