Unverified Commit 8d09630a authored by gongchensu's avatar gongchensu Committed by GitHub
Browse files

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
[submodule "third_party/spdlog"] [submodule "third_party/spdlog"]
path = third_party/spdlog path = third_party/spdlog
url = https://github.com/gabime/spdlog.git url = https://github.com/gabime/spdlog.git
[submodule "third_party/nlohmann_json"]
path = third_party/nlohmann_json
url = https://github.com/nlohmann/json.git
branch = master
...@@ -20,6 +20,7 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功 ...@@ -20,6 +20,7 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
- 天数智芯 GPU; - 天数智芯 GPU;
- 沐曦 GPU; - 沐曦 GPU;
- 海光 DCU; - 海光 DCU;
- 阿里 PPU;
- 华为昇腾 NPU; - 华为昇腾 NPU;
- 寒武纪 MLU; - 寒武纪 MLU;
- 昆仑芯 XPU; - 昆仑芯 XPU;
...@@ -103,6 +104,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS] ...@@ -103,6 +104,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
| `--qy-gpu=[y\|n]` | 是否编译QY GPU 接口实现 | n | `--qy-gpu=[y\|n]` | 是否编译QY GPU 接口实现 | n
| `--hygon-dcu=[y\|n]` | 是否编译海光 DCU 接口实现 | n | `--hygon-dcu=[y\|n]` | 是否编译海光 DCU 接口实现 | n
| `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n | `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n
| `--ali-ppu=[y\|n]` | 是否编译阿里 PPU 接口实现 | n
| `--ninetoothed=[y\|n]` | 是否编译九齿实现 | n | `--ninetoothed=[y\|n]` | 是否编译九齿实现 | n
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n | `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
...@@ -187,9 +189,9 @@ pip install -e . ...@@ -187,9 +189,9 @@ pip install -e .
```bash ```bash
# 测试单算子 # 测试单算子
python test/infinicore/ops/[operator].py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon] python test/infinicore/ops/[operator].py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
# 测试全部算子 # 测试全部算子
python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun] python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --ali]
``` ```
使用 -h 查看更多参数。 使用 -h 查看更多参数。
...@@ -198,9 +200,9 @@ python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia ...@@ -198,9 +200,9 @@ python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia
```shell ```shell
# 测试单算子 # 测试单算子
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon] python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
# 测试全部算子 # 测试全部算子
python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon] python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
``` ```
#### 通信库(InfiniCCL)测试 #### 通信库(InfiniCCL)测试
......
...@@ -47,6 +47,7 @@ typedef enum { ...@@ -47,6 +47,7 @@ typedef enum {
INFINI_DEVICE_KUNLUN = 7, INFINI_DEVICE_KUNLUN = 7,
INFINI_DEVICE_HYGON = 8, INFINI_DEVICE_HYGON = 8,
INFINI_DEVICE_QY = 9, INFINI_DEVICE_QY = 9,
INFINI_DEVICE_ALI = 10,
INFINI_DEVICE_TYPE_COUNT INFINI_DEVICE_TYPE_COUNT
} infiniDevice_t; } infiniDevice_t;
......
...@@ -3,4 +3,5 @@ ...@@ -3,4 +3,5 @@
#include "infinicore/device_event.hpp" #include "infinicore/device_event.hpp"
#include "infinicore/nn.hpp" #include "infinicore/nn.hpp"
#include "infinicore/ops.hpp" #include "infinicore/ops.hpp"
#include "infinicore/quantization.hpp"
#include "infinicore/tensor.hpp" #include "infinicore/tensor.hpp"
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include "../tensor.hpp" #include "../tensor.hpp"
#include <optional>
#include <type_traits> #include <type_traits>
namespace infinicore { namespace infinicore {
...@@ -24,6 +25,15 @@ inline void hash_combine(size_t &seed, Tensor tensor) { ...@@ -24,6 +25,15 @@ inline void hash_combine(size_t &seed, Tensor tensor) {
} }
} }
// Specialization for optional
template <typename T>
inline void hash_combine(size_t &seed, const std::optional<T> &opt) {
hash_combine(seed, opt.has_value());
if (opt) {
hash_combine(seed, *opt);
}
}
// Specialization for std::string // Specialization for std::string
inline void hash_combine(size_t &seed, const std::string &str) { inline void hash_combine(size_t &seed, const std::string &str) {
hash_combine(seed, std::hash<std::string>{}(str)); hash_combine(seed, std::hash<std::string>{}(str));
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
#include "../device.hpp" #include "../device.hpp"
#include "../memory.hpp" #include "../memory.hpp"
#include "../graph/graph.hpp"
#include <infiniop.h> #include <infiniop.h>
#include <infinirt.h> #include <infinirt.h>
...@@ -40,6 +42,12 @@ void destroyEvent(infinirtEvent_t event); ...@@ -40,6 +42,12 @@ void destroyEvent(infinirtEvent_t event);
float elapsedTime(infinirtEvent_t start, infinirtEvent_t end); float elapsedTime(infinirtEvent_t start, infinirtEvent_t end);
void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event); void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event);
// Graph recording APIs
// True while operators are being captured into a graph on this context.
bool isGraphRecording();
// Begins capturing subsequently issued operators into a new graph.
void startGraphRecording();
// Appends `op` to the graph currently being recorded.
void addGraphOperator(std::shared_ptr<graph::GraphOperator> op);
// Ends recording and returns the captured graph.
std::shared_ptr<graph::Graph> stopGraphRecording();
} // namespace context } // namespace context
} // namespace infinicore } // namespace infinicore
...@@ -22,6 +22,7 @@ public: ...@@ -22,6 +22,7 @@ public:
KUNLUN = INFINI_DEVICE_KUNLUN, KUNLUN = INFINI_DEVICE_KUNLUN,
HYGON = INFINI_DEVICE_HYGON, HYGON = INFINI_DEVICE_HYGON,
QY = INFINI_DEVICE_QY, QY = INFINI_DEVICE_QY,
ALI = INFINI_DEVICE_ALI,
COUNT = INFINI_DEVICE_TYPE_COUNT, COUNT = INFINI_DEVICE_TYPE_COUNT,
}; };
......
#pragma once
#include <memory>
#include <vector>
#include "../tensor.hpp"
namespace infinicore::graph {
// Forward declarations
class GraphManager;
// Tensor wrapper used inside recorded graphs; constructed from an
// existing Tensor (conversion semantics defined out of line).
// NOTE(review): the single-argument constructor is implicit — confirm
// whether `explicit` is intended here.
class GraphTensor : public Tensor {
public:
    GraphTensor(const Tensor &);
};
// Abstract interface for a node in a recorded graph: anything that can
// be replayed via run(). Virtual destructor allows deletion through the
// base pointer held by Graph::op_list_.
class GraphOperator {
public:
    virtual void run() const = 0;
    virtual ~GraphOperator() = default;
};
// Graph operator whose plan/run/cleanup functions are resolved through
// per-device dispatch tables (see the INFINICORE_GRAPH_OP_* macros:
// `plan` produces planned_meta_, `runner_` replays it, `deleter_` frees it).
class DispatchableGraphOperator : public GraphOperator {
public:
    // Replays the operator; presumably invokes runner_ on planned_meta_
    // (implementation out of line — confirm).
    void run() const override;
    // Out-of-line destructor; expected to release planned_meta_ through
    // deleter_ — confirm against the implementation.
    ~DispatchableGraphOperator() override;

protected:
    using run_schema = void (*)(void *);
    using cleanup_schema = void (*)(void **);

    // Default-initialize to null so these pointers are never indeterminate
    // before a derived constructor runs the dispatch macro.
    void *planned_meta_ = nullptr;
    run_schema runner_ = nullptr;
    cleanup_schema deleter_ = nullptr;
};
// An executable sequence of recorded GraphOperators. May additionally hold
// an opaque, possibly device-native representation (DeviceGraph) built by
// instantiate(); all behavior is defined out of line.
class Graph {
public:
    Graph();
    ~Graph();

    // Replays the recorded operators (or the instantiated device graph —
    // confirm against the implementation).
    void run() const;

protected:
    // Appends an operator to the recorded sequence.
    void add_operator(std::shared_ptr<GraphOperator> op);
    // Presumably builds device_graph_ from op_list_ — defined out of line.
    void instantiate();

    std::vector<std::shared_ptr<GraphOperator>> op_list_;

    // GraphManager drives recording and needs the protected API above.
    friend class GraphManager;

private:
    struct DeviceGraph; // opaque; defined in the implementation file
    std::unique_ptr<DeviceGraph> device_graph_;
};
} // namespace infinicore::graph
// Declares a dispatchable graph-operator class __OP_NAME__ whose eager
// entry point is execute(...). `plan` produces an opaque planned meta,
// `run` replays it, `cleanup` releases it; each is looked up per device.
// (Comments are kept outside the macro body: a `//` inside a
// backslash-continued line would swallow the continuation.)
#define INFINICORE_GRAPH_OP_CLASS(__OP_NAME__, ...) \
class __OP_NAME__ : public graph::DispatchableGraphOperator { \
public: \
using schema = void (*)(__VA_ARGS__); \
using plan_schema = void *(*)(__VA_ARGS__); \
static common::OpDispatcher<plan_schema> &plan_dispatcher(); \
static common::OpDispatcher<run_schema> &run_dispatcher(); \
static common::OpDispatcher<cleanup_schema> &cleanup_dispatcher(); \
__OP_NAME__(__VA_ARGS__); \
static void execute(__VA_ARGS__); \
};
// Defines the three dispatcher accessors declared by
// INFINICORE_GRAPH_OP_CLASS as Meyers singletons (one table per op).
#define INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(__OP_NAME__) \
common::OpDispatcher<__OP_NAME__::plan_schema> &__OP_NAME__::plan_dispatcher() { \
static common::OpDispatcher<__OP_NAME__::plan_schema> dispatcher_; \
return dispatcher_; \
} \
common::OpDispatcher<__OP_NAME__::run_schema> &__OP_NAME__::run_dispatcher() { \
static common::OpDispatcher<__OP_NAME__::run_schema> dispatcher_; \
return dispatcher_; \
} \
common::OpDispatcher<__OP_NAME__::cleanup_schema> &__OP_NAME__::cleanup_dispatcher() { \
static common::OpDispatcher<__OP_NAME__::cleanup_schema> dispatcher_; \
return dispatcher_; \
}
// Runs the plan function for __DEVICE_TYPE__ and caches the matching
// runner/deleter on the operator (expects planned_meta_/runner_/deleter_
// in scope, i.e. inside a DispatchableGraphOperator constructor).
#define INFINICORE_GRAPH_OP_DISPATCH(__DEVICE_TYPE__, ...) \
planned_meta_ = plan_dispatcher().lookup(__DEVICE_TYPE__)(__VA_ARGS__); \
runner_ = run_dispatcher().lookup(__DEVICE_TYPE__); \
deleter_ = cleanup_dispatcher().lookup(__DEVICE_TYPE__);
// Records the operator into the current graph when recording is active,
// otherwise executes it eagerly.
#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) \
auto ___op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
if (context::isGraphRecording()) { \
context::addGraphOperator(___op); \
} else { \
___op->run(); \
}
// Registers the same plan/run/cleanup functions for every device type.
// NOTE(review): declares a file-scope `static bool registered` — using
// this macro twice in one scope would redefine it; confirm usage pattern.
#define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \
static bool registered = []() { \
__OP_NAME__::plan_dispatcher().registerAll(__PLAN_F__, false); \
__OP_NAME__::run_dispatcher().registerAll(__RUN_F__, false); \
__OP_NAME__::cleanup_dispatcher().registerAll(__CLEANUP_F__, false); \
return true; \
}();
#pragma once #pragma once
#include "../ops.hpp" #include "../ops.hpp"
#include "../quantization.hpp"
#include "module.hpp" #include "module.hpp"
#include <infiniccl.h> #include <infiniccl.h>
#include <optional>
namespace infinicore::nn { namespace infinicore::nn {
...@@ -11,6 +13,9 @@ public: ...@@ -11,6 +13,9 @@ public:
BaseLinear(size_t in_features, size_t out_features, bool bias = true, BaseLinear(size_t in_features, size_t out_features, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device()); const DataType &dtype = DataType::F32, const Device &device = Device());
BaseLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());
// Forward pass: output = input @ weight.T + bias // Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const; Tensor forward(Tensor &input) const;
...@@ -27,12 +32,17 @@ public: ...@@ -27,12 +32,17 @@ public:
// Accessors for parameters // Accessors for parameters
Tensor weight() const { return weight_; } Tensor weight() const { return weight_; }
Tensor bias() const { return bias_; } Tensor bias() const { return bias_; }
Tensor weight_scale() const { return weight_scale_; }
Tensor weight_zeros() const { return weight_zeros_; }
protected: protected:
// Parameters // Parameters
INFINICORE_NN_PARAMETER(weight); INFINICORE_NN_PARAMETER(weight);
INFINICORE_NN_PARAMETER(bias); INFINICORE_NN_PARAMETER(bias);
INFINICORE_NN_PARAMETER(weight_scale);
INFINICORE_NN_PARAMETER(weight_zeros);
protected: protected:
// Helper method for common forward computation // Helper method for common forward computation
Tensor compute_linear(Tensor &input) const; Tensor compute_linear(Tensor &input) const;
...@@ -41,6 +51,7 @@ protected: ...@@ -41,6 +51,7 @@ protected:
size_t out_features_; size_t out_features_;
bool has_bias_; bool has_bias_;
DataType dtype_; DataType dtype_;
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization_ = std::make_shared<infinicore::quantization::NoneQuantization>(nullptr);
}; };
} // namespace infinicore::nn } // namespace infinicore::nn
...@@ -52,6 +63,9 @@ public: ...@@ -52,6 +63,9 @@ public:
Linear(size_t in_features, size_t out_features, bool bias = true, Linear(size_t in_features, size_t out_features, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device()); const DataType &dtype = DataType::F32, const Device &device = Device());
Linear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device());
// Forward pass: output = input @ weight.T + bias // Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const; Tensor forward(Tensor &input) const;
...@@ -65,6 +79,10 @@ public: ...@@ -65,6 +79,10 @@ public:
const DataType &dtype = DataType::F32, const Device &device = Device(), const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1); Size tp_rank = 0, Size tp_size = 1);
ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1);
// Forward pass: output = input @ weight.T + bias // Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const; Tensor forward(Tensor &input) const;
...@@ -82,6 +100,10 @@ public: ...@@ -82,6 +100,10 @@ public:
const DataType &dtype = DataType::F32, const Device &device = Device(), const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr); Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);
RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
const DataType &dtype = DataType::F32, const Device &device = Device(),
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);
// Forward pass: output = input @ weight.T + bias // Forward pass: output = input @ weight.T + bias
Tensor forward(Tensor &input) const; Tensor forward(Tensor &input) const;
......
#pragma once #pragma once
#include "module.hpp"
#include "../ops.hpp" #include "../ops.hpp"
#include "module.hpp"
namespace infinicore::nn { namespace infinicore::nn {
...@@ -57,6 +57,21 @@ public: ...@@ -57,6 +57,21 @@ public:
*/ */
Tensor forward(const Tensor &x) const; Tensor forward(const Tensor &x) const;
/**
* @brief Forward pass: apply RMSNorm in-place with residual
*
* @param x Input tensor of shape (*, normalized_shape) where * is any number of dimensions.
* Will be modified in-place to the normalized output.
* @param residual Residual tensor to add to input before normalization.
* Will be modified in-place to the sum of input and residual.
*
* The normalization is applied over the last dimension.
* For example:
* Input: [batch, seq_len, hidden_size] -> normalize over hidden_size
* Input: [batch, hidden_size] -> normalize over hidden_size
*/
void forward_inplace(Tensor &x, Tensor &residual) const;
// Module information // Module information
size_t normalized_shape() const { return normalized_shape_; } size_t normalized_shape() const { return normalized_shape_; }
double eps() const { return eps_; } double eps() const { return eps_; }
...@@ -73,9 +88,9 @@ protected: ...@@ -73,9 +88,9 @@ protected:
INFINICORE_NN_PARAMETER(weight); INFINICORE_NN_PARAMETER(weight);
private: private:
size_t normalized_shape_; // Size of the feature dimension size_t normalized_shape_; // Size of the feature dimension
double eps_; // Epsilon for numerical stability double eps_; // Epsilon for numerical stability
DataType dtype_; // Data type for weight DataType dtype_; // Data type for weight
}; };
} // namespace infinicore::nn } // namespace infinicore::nn
...@@ -17,6 +17,47 @@ public: ...@@ -17,6 +17,47 @@ public:
GPT_NEOX = 1, // GPT-NeoX style RoPE algorithm (First half dimensions for sin, second half for cos) GPT_NEOX = 1, // GPT-NeoX style RoPE algorithm (First half dimensions for sin, second half for cos)
}; };
// Identifies which RoPE scaling strategy a ScalingConfig describes.
enum class ScalingType {
    DEFAULT = 0, // Default RoPE
    LONGROPE = 1 // Long-RoPE
};

// Base class for RoPE scaling configurations; carries only the tag that
// tells the consumer which concrete configuration it holds.
class ScalingConfig {
public:
    virtual ~ScalingConfig() = default;

    ScalingType type() const { return type_; }

protected:
    ScalingType type_ = ScalingType::DEFAULT;

    ScalingConfig(ScalingType type) : type_(type) {}
};

// Long-RoPE scaling: per-dimension frequency factors for short and long
// contexts, plus a scalar attention factor derived from the extension ratio.
class LongRopeConfig : public ScalingConfig {
protected:
    std::vector<float> short_factor_;
    std::vector<float> long_factor_;
    size_t original_max_position_embeddings_;
    float factor_;

public:
    LongRopeConfig(
        std::vector<float> short_factor,
        std::vector<float> long_factor,
        size_t original_max_position_embeddings,
        float factor = 1.0f)
        : ScalingConfig(ScalingType::LONGROPE),
          short_factor_(short_factor),
          long_factor_(long_factor),
          original_max_position_embeddings_(original_max_position_embeddings),
          factor_(attention_factor(factor, original_max_position_embeddings)) {}

    ~LongRopeConfig() override = default;

    size_t original_max_position_embeddings() const { return original_max_position_embeddings_; }
    const std::vector<float> &short_factor() const { return short_factor_; }
    const std::vector<float> &long_factor() const { return long_factor_; }
    float factor() const { return factor_; }

private:
    // Long-RoPE attention scaling: identity when no extension is requested,
    // otherwise sqrt(1 + ln(factor) / ln(original_len)).
    static float attention_factor(float factor, size_t original_len) {
        return factor == 1.0f
                   ? 1.0f
                   : static_cast<float>(std::sqrt(1 + std::log(factor) / std::log(original_len)));
    }
};
/** /**
* @brief Construct a RoPE layer * @brief Construct a RoPE layer
* *
...@@ -26,13 +67,15 @@ public: ...@@ -26,13 +67,15 @@ public:
* @param algo RoPE algorithm type (default: Algo::GPT_J) * @param algo RoPE algorithm type (default: Algo::GPT_J)
* @param dtype Data type for sin/cos cache (default: DataType::F32) * @param dtype Data type for sin/cos cache (default: DataType::F32)
* @param device Device to create the cache on * @param device Device to create the cache on
* @param scaling RoPE scaling type (default: nullptr)
*/ */
RoPE(size_t head_dim, RoPE(size_t head_dim,
size_t max_seq_len, size_t max_seq_len,
double theta = 10000.0, double theta = 10000.0,
Algo algo = Algo::GPT_J, Algo algo = Algo::GPT_J,
const DataType &dtype = DataType::F32, const DataType &dtype = DataType::F32,
const Device &device = Device()); const Device &device = Device(),
std::shared_ptr<ScalingConfig> scaling = nullptr);
/** /**
* @brief Forward pass: apply RoPE to a tensor * @brief Forward pass: apply RoPE to a tensor
...@@ -88,11 +131,12 @@ protected: ...@@ -88,11 +131,12 @@ protected:
private: private:
void initialize_cache(); void initialize_cache();
size_t head_dim_; // Dimension of each attention head size_t head_dim_; // Dimension of each attention head
size_t max_seq_len_; // Maximum sequence length size_t max_seq_len_; // Maximum sequence length
double theta_; // Base frequency for rotary embeddings double theta_; // Base frequency for rotary embeddings
Algo algo_; // RoPE algorithm type Algo algo_; // RoPE algorithm type
DataType dtype_; // Data type for cache tables DataType dtype_; // Data type for cache tables
std::shared_ptr<ScalingConfig> scaling_; // RoPE scaling type
}; };
} // namespace infinicore::nn } // namespace infinicore::nn
#pragma once #pragma once
#include "ops/add.hpp" #include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/attention.hpp" #include "ops/attention.hpp"
#include "ops/causal_softmax.hpp" #include "ops/causal_softmax.hpp"
#include "ops/embedding.hpp"
#include "ops/flash_attention.hpp"
#include "ops/kv_caching.hpp"
#include "ops/matmul.hpp" #include "ops/matmul.hpp"
#include "ops/ones.hpp" #include "ops/ones.hpp"
#include "ops/paged_attention.hpp"
#include "ops/paged_attention_prefill.hpp"
#include "ops/paged_caching.hpp"
#include "ops/random_sample.hpp"
#include "ops/rearrange.hpp" #include "ops/rearrange.hpp"
#include "ops/rms_norm.hpp" #include "ops/rms_norm.hpp"
#include "ops/rope.hpp" #include "ops/rope.hpp"
#include "ops/silu.hpp" #include "ops/silu.hpp"
#include "ops/silu_and_mul.hpp"
#include "ops/swiglu.hpp" #include "ops/swiglu.hpp"
#pragma once #pragma once
#include "../device.hpp" #include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp" #include "common/op.hpp"
namespace infinicore::op { namespace infinicore::op {
class Add {
public:
using schema = void (*)(Tensor, Tensor, Tensor);
static void execute(Tensor c, Tensor a, Tensor b);
static common::OpDispatcher<schema> &dispatcher();
};
Tensor add(Tensor a, Tensor b); INFINICORE_GRAPH_OP_CLASS(Add, Tensor, const Tensor &, const Tensor &);
void add_(Tensor c, Tensor a, Tensor b);
Tensor operator+(Tensor a, Tensor b); Tensor add(const Tensor &a, const Tensor &b);
void add_(Tensor c, const Tensor &a, const Tensor &b);
} // namespace infinicore::op } // namespace infinicore::op
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp" // INFINICORE_GRAPH_OP_CLASS (consistent with sibling op headers)
#include "common/op.hpp"

#include <utility>

namespace infinicore::op {

// Per-device dispatched fused Add + RMSNorm graph operator.
// Schema: (out, residual, a, b, weight, epsilon).
INFINICORE_GRAPH_OP_CLASS(AddRMSNorm, Tensor, Tensor, const Tensor &, const Tensor &, const Tensor &, float);

// Fused Add and RMS Normalization
// Returns: (normalized_result, add_result)
// The add_result can be used as residual for subsequent layers
std::pair<Tensor, Tensor> add_rms_norm(const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon = 1e-5f);

// Out-parameter variant: normalized result written to `out`, the sum a+b
// written to `residual`.
void add_rms_norm_(Tensor out, Tensor residual, const Tensor &a, const Tensor &b, const Tensor &weight, float epsilon = 1e-5f);

// Fused Add and RMS Normalization (inplace)
// normalized_result will be stored in input, add_result will be stored in residual
void add_rms_norm_inplace(Tensor input, Tensor residual, const Tensor &weight, float epsilon = 1e-5f);

} // namespace infinicore::op
#pragma once #pragma once
#include "../device.hpp" #include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp" #include "common/op.hpp"
namespace infinicore::op { namespace infinicore::op {
class CausalSoftmax {
public:
using schema = void (*)(Tensor, Tensor);
static void execute(Tensor output, Tensor input);
static common::OpDispatcher<schema> &dispatcher();
};
Tensor causal_softmax(Tensor input); INFINICORE_GRAPH_OP_CLASS(CausalSoftmax, Tensor, const Tensor &);
void causal_softmax_(Tensor output, Tensor input);
Tensor causal_softmax(const Tensor &input);
void causal_softmax_(Tensor output, const Tensor &input);
} // namespace infinicore::op } // namespace infinicore::op
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp" // INFINICORE_GRAPH_OP_CLASS (consistent with sibling op headers)
#include "common/op.hpp"

#include <optional>

namespace infinicore::op {

// Per-device dispatched AWQ dequantization graph operator.
// Schema: (x, x_packed, x_scale, x_zeros).
INFINICORE_GRAPH_OP_CLASS(DequantizeAWQ, Tensor, const Tensor &, const Tensor &, const Tensor &);

// Dequantizes AWQ-packed weights `x_packed` into `x`, applying `x_scale`
// and `x_zeros` (presumably per-group scales/zero-points — confirm the
// grouping layout against the implementation).
void dequantize_awq_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros);

} // namespace infinicore::op
#pragma once
#include "../../device.hpp"
#include "../../graph/graph.hpp"
#include "../common/op.hpp"
#include <infiniccl.h>
namespace infinicore::op::distributed {
// Collective all-reduce recorded as a graph operator. Holds the tensors,
// reduction op, and communicator it was constructed with via an opaque
// backend plan (planned_meta_).
// NOTE(review): user-declared destructor plus a raw owning pointer —
// copying an AllReduce would be unsafe; consider deleting copy/move.
class AllReduce : public graph::GraphOperator {
public:
    AllReduce(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator);
    ~AllReduce();

    // Replays the planned all-reduce.
    void run() const override;

    // Convenience entry point; presumably plans and runs immediately —
    // confirm against the implementation.
    static void execute(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator);

private:
    // Default-initialize to null so the pointer is never indeterminate
    // before the constructor assigns it.
    void *planned_meta_ = nullptr;
};
// Returns the all-reduced result of `input` (output allocation semantics
// defined in the implementation — presumably shaped like `input`).
Tensor allreduce(const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator);
// All-reduces `input` into the caller-provided `output`.
void allreduce_(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator);

} // namespace infinicore::op::distributed
#pragma once #pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp" #include "common/op.hpp"
namespace infinicore::op { namespace infinicore::op {
Tensor embedding(Tensor input, Tensor weight); INFINICORE_GRAPH_OP_CLASS(Embedding, Tensor, const Tensor &, const Tensor &);
void embedding_(Tensor out, Tensor input, Tensor weight);
Tensor embedding(const Tensor &input, const Tensor &weight);
void embedding_(Tensor out, const Tensor &input, const Tensor &weight);
} // namespace infinicore::op } // namespace infinicore::op
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp" // INFINICORE_GRAPH_OP_CLASS (consistent with sibling op headers)
#include "common/op.hpp"

namespace infinicore::op {

// Per-device dispatched FlashAttention graph operator.
// Schema: (out, q, k, v, total_kv_len, scale, is_causal).
INFINICORE_GRAPH_OP_CLASS(FlashAttention, Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, float, bool);

// Returns the attention output for q/k/v. `total_kv_len` carries effective
// KV lengths, `scale` scales the QK product, `is_causal` applies a causal
// mask. (Tensor shapes are not visible here — see the implementation.)
Tensor flash_attention(const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal);

// Out-parameter variant writing the attention output into `out`.
void flash_attention_(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal);

} // namespace infinicore::op
#pragma once #pragma once
#include "../device.hpp" #include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp" #include "common/op.hpp"
namespace infinicore::op { namespace infinicore::op {
class Gemm { INFINICORE_GRAPH_OP_CLASS(Gemm, Tensor, const Tensor &, const Tensor &, float, float);
public:
using schema = void (*)(Tensor, Tensor, Tensor, float, float);
static void execute(Tensor c, Tensor a, Tensor b, float alpha, float beta);
static common::OpDispatcher<schema> &dispatcher();
};
Tensor gemm(Tensor a, Tensor b, float alpha = 1.0f, float beta = 0.0f); Tensor gemm(const Tensor &a, const Tensor &b, float alpha = 1.0f, float beta = 0.0f);
void gemm_(Tensor c, Tensor a, Tensor b, float alpha, float beta); void gemm_(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta);
} // namespace infinicore::op } // namespace infinicore::op
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment