Unverified Commit 8d09630a authored by gongchensu, committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
...@@ -133,6 +133,19 @@ public:
void debug() const;
/**
* Unsafe API that returns a new tensor sharing the same raw memory, which becomes untracked by the allocator.
* It is used to loosely track a piece of memory while still allowing it to be reused,
* typically in a compute-graph scenario (see the usage sketch after this header).
*/
Tensor to_blob_() const;
/**
* Unsafe API that returns a new tensor with the same memory and lets the allocator track that memory again.
* Should only be called on a tensor returned by to_blob_().
*/
Tensor resume_from_blob_() const;
///
/// Data Transfer APIs
///
...@@ -294,9 +307,13 @@ protected:
friend class Tensor;
protected:
TensorMetaData meta_;
TensorData data_;
private:
// Marks whether this tensor was created by to_blob_()
bool to_blob_mark_ = false;
};
} // namespace infinicore
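
A minimal usage sketch of the two new methods, assuming an existing infinicore::Tensor named activation created elsewhere; everything except to_blob_() and resume_from_blob_() is a placeholder, and real code should keep the lifetime caveats from the comments above in mind.

// Sketch only: `activation` is assumed to be a live infinicore::Tensor.
// Detach the raw memory from allocator tracking so it can be reused, e.g. while
// a compute graph holds on to the buffer across replays.
infinicore::Tensor blob = activation.to_blob_();

// ... build / replay compute-graph work that reuses the same raw memory ...

// Hand the memory back to the allocator. Per the comment above, this is only
// valid on a tensor that was produced by to_blob_().
infinicore::Tensor tracked = blob.resume_from_blob_();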
...@@ -3,18 +3,27 @@
#include "infiniop/handle.h"
#include "infiniop/ops/add.h"
#include "infiniop/ops/add_rms_norm.h"
#include "infiniop/ops/attention.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/dequantize_awq.h"
#include "infiniop/ops/embedding.h"
#include "infiniop/ops/flash_attention.h"
#include "infiniop/ops/gelu.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/int8_gemm.h"
#include "infiniop/ops/kv_caching.h"
#include "infiniop/ops/layer_norm.h"
#include "infiniop/ops/logsoftmax.h"
#include "infiniop/ops/lp_norm.h"
#include "infiniop/ops/mul.h"
#include "infiniop/ops/ones.h"
#include "infiniop/ops/paged_attention.h"
#include "infiniop/ops/paged_attention_prefill.h"
#include "infiniop/ops/paged_caching.h"
#include "infiniop/ops/quant/per_channel_quant_int8.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
#include "infiniop/ops/relu.h"
...@@ -22,6 +31,7 @@
#include "infiniop/ops/rope.h"
#include "infiniop/ops/sigmoid.h"
#include "infiniop/ops/silu.h"
#include "infiniop/ops/silu_and_mul.h"
#include "infiniop/ops/softmax.h"
#include "infiniop/ops/softplus.h"
#include "infiniop/ops/sub.h"
......
#ifndef __INFINIOP_ADD_RMS_NORM_API_H__
#define __INFINIOP_ADD_RMS_NORM_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopAddRMSNormDescriptor_t;
__C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor(
infiniopHandle_t handle,
infiniopAddRMSNormDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t residual_out_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
infiniopTensorDescriptor_t weight_desc,
float epsilon);
__C __export infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(infiniopAddRMSNormDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopAddRMSNorm(infiniopAddRMSNormDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
void *residual_out,
const void *a,
const void *b,
const void *weight,
void *stream);
__C __export infiniStatus_t infiniopDestroyAddRMSNormDescriptor(infiniopAddRMSNormDescriptor_t desc);
#endif
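
The operator headers in this change all follow the same descriptor / workspace / execute / destroy lifecycle. A hedged sketch of that flow for AddRMSNorm, assuming a valid infiniopHandle_t, pre-built tensor descriptors, device buffers, and a stream (all placeholder names); real code should also check each returned infiniStatus_t.

infiniopAddRMSNormDescriptor_t desc = nullptr;
// y_desc, residual_out_desc, a_desc, b_desc and w_desc are assumed to describe
// tensors of matching shape; 1e-6f is a typical RMSNorm epsilon.
infiniopCreateAddRMSNormDescriptor(handle, &desc, y_desc, residual_out_desc,
                                   a_desc, b_desc, w_desc, 1e-6f);

size_t workspace_size = 0;
infiniopGetAddRMSNormWorkspaceSize(desc, &workspace_size);
// `workspace` is assumed to be a device allocation of at least workspace_size bytes.

infiniopAddRMSNorm(desc, workspace, workspace_size,
                   y, residual_out, a, b, w, stream);

infiniopDestroyAddRMSNormDescriptor(desc);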
#ifndef __INFINIOP_EMBEDDING_API_H__
#define __INFINIOP_EMBEDDING_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopEmbeddingDescriptor_t;
__C __export infiniStatus_t infiniopCreateEmbeddingDescriptor(
infiniopHandle_t handle,
infiniopEmbeddingDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc);
__C __export infiniStatus_t infiniopEmbedding(
infiniopEmbeddingDescriptor_t desc,
void *output,
const void *input,
const void *weight,
void *stream);
__C __export infiniStatus_t infiniopDestroyEmbeddingDescriptor(
infiniopEmbeddingDescriptor_t desc);
#endif
#ifndef __INFINIOP_FLASH_ATTENTION_API_H__
#define __INFINIOP_FLASH_ATTENTION_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopFlashAttentionDescriptor_t;
__C __export infiniStatus_t infiniopCreateFlashAttentionDescriptor(
infiniopHandle_t handle,
infiniopFlashAttentionDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t q_desc,
infiniopTensorDescriptor_t k_desc,
infiniopTensorDescriptor_t v_desc,
infiniopTensorDescriptor_t total_kv_len,
float scale,
char is_causal);
__C __export infiniStatus_t infiniopGetFlashAttentionWorkspaceSize(
infiniopFlashAttentionDescriptor_t desc,
size_t *size);
__C __export infiniStatus_t infiniopFlashAttention(
infiniopFlashAttentionDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *q,
const void *k,
const void *v,
const void *total_kv_len,
void *stream);
__C __export infiniStatus_t infiniopDestroyFlashAttentionDescriptor(
infiniopFlashAttentionDescriptor_t desc);
#endif
#ifndef __INFINIOP_I8GEMM_API_H__
#define __INFINIOP_I8GEMM_API_H__
#include "../operator_descriptor.h"
typedef InfiniopDescriptor *infiniopI8GemmDescriptor_t;
__C __export infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
infiniopI8GemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t bias_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t weights_desc,
infiniopTensorDescriptor_t weights_scale_desc);
__C __export infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *bias,
const void *x,
const void *x_scale,
const void *weights,
const void *weights_scale,
void *stream);
__C __export infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t desc);
#endif
#ifndef __INFINIOP_KV_CACHING_API_H__
#define __INFINIOP_KV_CACHING_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopKVCachingDescriptor_t;
__C __export infiniStatus_t infiniopCreateKVCachingDescriptor(
infiniopHandle_t handle,
infiniopKVCachingDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t k_cache,
infiniopTensorDescriptor_t v_cache,
infiniopTensorDescriptor_t k,
infiniopTensorDescriptor_t v,
infiniopTensorDescriptor_t past_kv_lengths);
__C __export infiniStatus_t infiniopGetKVCachingWorkspaceSize(infiniopKVCachingDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopKVCaching(infiniopKVCachingDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *k_cache,
void *v_cache,
const void *k,
const void *v,
const void *past_kv_lengths,
void *stream);
__C __export infiniStatus_t infiniopDestroyKVCachingDescriptor(infiniopKVCachingDescriptor_t desc);
#endif
#ifndef __INFINIOP_PAGED_ATTENTION_API_H__
#define __INFINIOP_PAGED_ATTENTION_API_H__
#include "../operator_descriptor.h"
// Define an opaque handle for the Paged Attention descriptor.
typedef struct InfiniopDescriptor *infiniopPagedAttentionDescriptor_t;
/**
* @brief Creates a descriptor for the Paged Attention v1 operation.
*
* @param handle The library context handle.
* @param desc_ptr Pointer to the created descriptor.
* @param out_desc [Output] Shape: (num_seqs, num_heads, head_size).
* The output tensor for the attention mechanism.
* @param q_desc [Input] Shape: (num_seqs, num_heads, head_size).
* The query tensor.
* @param k_cache_desc [Input] Shape: (num_blocks, num_kv_heads, block_size, head_size).
* Paged key cache storing keys for all sequences.
* @param v_cache_desc [Input] Shape: (num_blocks, num_kv_heads, block_size, head_size).
* Paged value cache storing values for all sequences.
* @param block_tables_desc [Input] Shape: (num_seqs, max_num_blocks_per_seq).
* Maps each sequence to its physical block indices in the cache.
* Expected DType: int64_t (I64).
* @param seq_lens_desc [Input] Shape: (num_seqs,).
* The current logical length of each sequence.
* Expected DType: int64_t (I64).
* @param alibi_slopes_desc [Optional] Shape: (num_heads,).
* Slopes for ALiBi (Attention with Linear Biases). Can be NULL.
* @param scale The attention scaling factor (typically 1/sqrt(head_size)).
* @return infiniStatus_t Status code.
*/
__C __export infiniStatus_t infiniopCreatePagedAttentionDescriptor(
infiniopHandle_t handle,
infiniopPagedAttentionDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t q_desc,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
infiniopTensorDescriptor_t block_tables_desc,
infiniopTensorDescriptor_t seq_lens_desc,
infiniopTensorDescriptor_t alibi_slopes_desc,
float scale);
/**
* @brief Retrieves the workspace size required for the Paged Attention operation.
*
* @param desc The Paged Attention descriptor.
* @param size A pointer to store the required workspace size in bytes.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopGetPagedAttentionWorkspaceSize(
infiniopPagedAttentionDescriptor_t desc, size_t *size);
/**
* @brief Executes the Paged Attention v1 operation.
*
* @param desc The Paged Attention descriptor.
* @param workspace Pointer to the workspace memory.
* @param workspace_size The size of the workspace.
* @param out Pointer to the output tensor data.
* @param q Pointer to the query tensor data.
* @param k_cache Pointer to the key cache data.
* @param v_cache Pointer to the value cache data.
* @param block_tables Pointer to the block tables data.
* @param seq_lens Pointer to the sequence lengths data.
* @param alibi_slopes Pointer to the ALiBi slopes data. Can be NULL.
* @param stream The CUDA stream for the operation. Can be NULL.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopPagedAttention(
infiniopPagedAttentionDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *q,
const void *k_cache,
const void *v_cache,
const void *block_tables,
const void *seq_lens,
const void *alibi_slopes,
void *stream);
/**
* @brief Destroys a Paged Attention descriptor.
*
* @param desc The descriptor to be destroyed.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopDestroyPagedAttentionDescriptor(
infiniopPagedAttentionDescriptor_t desc);
#endif // __INFINIOP_PAGED_ATTENTION_API_H__
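
A hedged decode-step sketch based on the shapes documented above; handle, descriptor, and buffer setup are assumed to have happened elsewhere, and all identifiers other than the API calls are placeholders.

// q/out: (num_seqs, num_heads, head_size); caches: (num_blocks, num_kv_heads, block_size, head_size);
// block_tables: (num_seqs, max_num_blocks_per_seq) int64; seq_lens: (num_seqs,) int64.
const float scale = 0.125f; // 1/sqrt(head_size) for head_size == 64

infiniopPagedAttentionDescriptor_t desc = nullptr;
infiniopCreatePagedAttentionDescriptor(handle, &desc, out_desc, q_desc,
                                       k_cache_desc, v_cache_desc,
                                       block_tables_desc, seq_lens_desc,
                                       /*alibi_slopes_desc=*/nullptr, scale);

size_t workspace_size = 0;
infiniopGetPagedAttentionWorkspaceSize(desc, &workspace_size);
infiniopPagedAttention(desc, workspace, workspace_size, out, q, k_cache, v_cache,
                       block_tables, seq_lens, /*alibi_slopes=*/nullptr, stream);
infiniopDestroyPagedAttentionDescriptor(desc);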
#ifndef __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
#define __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
#include "../operator_descriptor.h"
// Define an opaque handle for the Paged Attention Prefill descriptor.
typedef struct InfiniopDescriptor *infiniopPagedAttentionPrefillDescriptor_t;
/**
* @brief Creates a descriptor for the Paged Attention Prefill operation.
* @param handle The handle to the InfiniOP library context.
* @param desc_ptr A pointer to store the created descriptor.
* @param out_desc Descriptor for the output tensor.
* Shape: [total_q_tokens, num_heads, head_size]
* @param q_desc Descriptor for the query tensor (packed/flattened).
* Shape: [total_q_tokens, num_heads, head_size]
* @param k_cache_desc Descriptor for the global physical key cache.
* Shape: [max_num_blocks, num_kv_heads, block_size, head_size]
* @param v_cache_desc Descriptor for the global physical value cache.
* Shape: [max_num_blocks, num_kv_heads, block_size, head_size]
* @param block_tables_desc Descriptor for the block tables mapping logical blocks to physical blocks.
* Shape: [batch_size, max_blocks_per_seq]
* @param seq_lens_desc Descriptor for the total KV lengths of each sequence.
* Shape: [batch_size]
* @param cum_seq_lens_q_desc Descriptor for the cumulative start position (prefix sum) of each Q sequence.
* Shape: [batch_size + 1]
* @param alibi_slopes_desc Optional descriptor for the ALiBi slopes tensor. Can be NULL.
* Shape: [num_heads]
* @param scale The attention scaling factor (typically 1.0 / sqrt(head_size)).
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopCreatePagedAttentionPrefillDescriptor(
infiniopHandle_t handle,
infiniopPagedAttentionPrefillDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t q_desc,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
infiniopTensorDescriptor_t block_tables_desc,
infiniopTensorDescriptor_t seq_lens_desc,
infiniopTensorDescriptor_t cum_seq_lens_q_desc,
infiniopTensorDescriptor_t alibi_slopes_desc,
float scale);
/**
* @brief Retrieves the workspace size required for the Paged Attention Prefill operation.
*/
__C __export infiniStatus_t infiniopGetPagedAttentionPrefillWorkspaceSize(
infiniopPagedAttentionPrefillDescriptor_t desc, size_t *size);
/**
* @brief Executes the Paged Attention Prefill operation.
* @param desc The Paged Attention Prefill descriptor.
* @param workspace Pointer to the workspace memory.
* @param workspace_size The size of the workspace.
* @param out Pointer to the output tensor data.
* @param q Pointer to the query tensor data (packed).
* @param k_cache Pointer to the global key cache data.
* @param v_cache Pointer to the global value cache data.
* @param block_tables Pointer to the block tables data.
* @param seq_lens Pointer to the KV lengths data.
* @param cum_seq_lens_q Pointer to the Q cumulative sequence lengths data (prefix sum).
* @param alibi_slopes Pointer to the ALiBi slopes data. Can be NULL.
* @param stream The device stream (e.g., cudaStream_t) for the operation.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopPagedAttentionPrefill(
infiniopPagedAttentionPrefillDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *q,
const void *k_cache,
const void *v_cache,
const void *block_tables,
const void *seq_lens,
const void *cum_seq_lens_q,
const void *alibi_slopes,
void *stream);
/**
* @brief Destroys a Paged Attention Prefill descriptor.
*/
__C __export infiniStatus_t infiniopDestroyPagedAttentionPrefillDescriptor(
infiniopPagedAttentionPrefillDescriptor_t desc);
#endif // __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
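
The main difference from the decode path above is the packed query layout. Below is a small illustrative sketch of how cum_seq_lens_q relates to the packed q/out tensors; it is host-side helper code, not part of the API, and int64 is assumed to match seq_lens since the header does not state the dtype.

#include <cstdint>
#include <vector>

// Illustrative only: build the prefix sum for a batch of query lengths.
std::vector<int64_t> build_cum_seq_lens_q(const std::vector<int64_t> &q_lens) {
    std::vector<int64_t> cum(q_lens.size() + 1, 0);
    for (size_t i = 0; i < q_lens.size(); ++i) {
        cum[i + 1] = cum[i] + q_lens[i];
    }
    return cum; // for q_lens {5, 2, 9} this is {0, 5, 7, 16}
}
// total_q_tokens == cum_seq_lens_q.back(); q and out are packed as
// [total_q_tokens, num_heads, head_size], and sequence i occupies rows
// [cum_seq_lens_q[i], cum_seq_lens_q[i+1]).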
#ifndef __INFINIOP_PAGED_CACHING_API_H__
#define __INFINIOP_PAGED_CACHING_API_H__
#include "../operator_descriptor.h"
// Define an opaque handle for the Paged Caching descriptor.
typedef struct InfiniopDescriptor *infiniopPagedCachingDescriptor_t;
/**
* @brief Creates a descriptor for the Paged Caching operation.
*
* This function initializes a descriptor that holds all the metadata needed
* to copy key/value vectors into their respective cache pools.
*
* @param handle The handle to the InfiniOP library context.
* @param desc_ptr A pointer to store the created descriptor.
* @param k_cache_desc Descriptor for the key cache pool tensor.
* @param v_cache_desc Descriptor for the value cache pool tensor.
* @param k_desc Descriptor for the source key tensor.
* @param v_desc Descriptor for the source value tensor.
* @param slot_mapping_desc Descriptor for the slot mapping tensor.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopCreatePagedCachingDescriptor(
infiniopHandle_t handle,
infiniopPagedCachingDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
infiniopTensorDescriptor_t k_desc,
infiniopTensorDescriptor_t v_desc,
infiniopTensorDescriptor_t slot_mapping_desc);
/**
* @brief Retrieves the workspace size required for the Paged Caching operation.
*
* @param desc The Paged Caching descriptor.
* @param size A pointer to store the required workspace size in bytes (typically 0).
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopGetPagedCachingWorkspaceSize(
infiniopPagedCachingDescriptor_t desc, size_t *size);
/**
* @brief Executes the Paged Caching operation.
*
* @param desc The Paged Caching descriptor.
* @param workspace Pointer to the workspace memory.
* @param workspace_size The size of the workspace.
* @param k_cache Pointer to the key cache pool data.
* @param v_cache Pointer to the value cache pool data.
* @param k Pointer to the source key tensor data.
* @param v Pointer to the source value tensor data.
* @param slot_mapping Pointer to the slot mapping data.
* @param stream The CUDA stream for the operation. Can be NULL.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopPagedCaching(
infiniopPagedCachingDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *k_cache,
void *v_cache,
const void *k,
const void *v,
const void *slot_mapping,
void *stream);
/**
* @brief Destroys a Paged Caching descriptor.
*
* @param desc The descriptor to be destroyed.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopDestroyPagedCachingDescriptor(
infiniopPagedCachingDescriptor_t desc);
#endif // __INFINIOP_PAGED_CACHING_API_H__
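
The header does not spell out the slot_mapping semantics. The sketch below assumes the common paged-KV convention of one flat destination slot per incoming token (slot = physical_block * block_size + offset within the block); this is an assumption rather than a documented contract.

#include <cstdint>
#include <vector>

// Hedged sketch: derive per-token cache slots for `num_new_tokens` tokens of one
// sequence from its block table, under the assumed flat-slot convention.
std::vector<int64_t> build_slot_mapping(const std::vector<int64_t> &block_table,
                                        int64_t block_size,
                                        int64_t first_token_pos,
                                        int64_t num_new_tokens) {
    std::vector<int64_t> slots;
    slots.reserve(static_cast<size_t>(num_new_tokens));
    for (int64_t i = 0; i < num_new_tokens; ++i) {
        const int64_t pos = first_token_pos + i;             // logical token position in the sequence
        const int64_t block = block_table[pos / block_size]; // physical block index from the block table
        slots.push_back(block * block_size + pos % block_size);
    }
    return slots;
}

The resulting array is what infiniopPagedCaching would consume through slot_mapping alongside the k/v tensors being scattered into the cache pools.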
#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#include "../../operator_descriptor.h"
typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;
__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);
__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
void *stream);
__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc);
#endif
#ifndef __INFINIOP_SILU_AND_MUL_API_H__
#define __INFINIOP_SILU_AND_MUL_API_H__
#include "../operator_descriptor.h"
/**
* @brief Opaque handle for the SiluAndMul descriptor.
*/
typedef struct InfiniopDescriptor *infiniopSiluAndMulDescriptor_t;
/**
* @brief Creates a descriptor for the SiLU and Multiply (SiluAndMul) operation.
*
* Format: (input_shape, output_shape)
* Referencing the vLLM SiluAndMul kernel interface:
* - input_shape is [..., 2*d] (the last dimension is split into two halves: one for SiLU, one for multiplication)
* - output_shape is [..., d] (the last dimension is halved)
*
* @param handle The handle to the InfiniOP library context.
* @param desc_ptr A pointer to store the created descriptor.
* @param output Descriptor for the output tensor. Shape [..., d].
* @param input Descriptor for the input tensor. Shape [..., 2*d].
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopCreateSiluAndMulDescriptor(
infiniopHandle_t handle,
infiniopSiluAndMulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output,
infiniopTensorDescriptor_t input);
/**
* @brief Queries the workspace size required for SiluAndMul computation.
* @param desc The SiluAndMul descriptor.
* @param size Pointer to store the required workspace size in bytes.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(
infiniopSiluAndMulDescriptor_t desc,
size_t *size);
/**
* @brief Executes the SiluAndMul operation.
*
* Performs SiLU activation on the first half of the last dimension of `input`,
* multiplies element-wise with the second half, and stores the result in `output`.
*
* @param desc The SiluAndMul descriptor.
* @param workspace Pointer to workspace memory allocated according to GetWorkspaceSize().
* @param workspace_size Size of the workspace in bytes.
* @param output Pointer to the output tensor memory. Shape [..., d].
* @param input Pointer to the input tensor memory. Shape [..., 2*d].
* @param stream Pointer to the execution stream (e.g., CUDA stream). Can be NULL for default stream.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopSiluAndMul(
infiniopSiluAndMulDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream);
/**
* @brief Destroys a previously created SiluAndMul descriptor.
* @param desc The descriptor to destroy.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopDestroySiluAndMulDescriptor(
infiniopSiluAndMulDescriptor_t desc);
#endif // __INFINIOP_SILU_AND_MUL_API_H__
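
To make the split-and-multiply semantics above concrete, here is a scalar reference for a single row, assuming float data (the library kernel operates on whatever dtypes its descriptors allow); this is a sketch of the math, not the implementation.

#include <cmath>
#include <cstddef>

// out[j] = silu(in[j]) * in[j + d], with silu(x) = x / (1 + exp(-x)).
// `in` holds 2*d contiguous elements of one row, `out` holds d elements.
void silu_and_mul_row(const float *in, float *out, std::size_t d) {
    for (std::size_t j = 0; j < d; ++j) {
        const float gate = in[j];   // first half: goes through SiLU
        const float up = in[j + d]; // second half: elementwise multiplier
        out[j] = (gate / (1.0f + std::exp(-gate))) * up;
    }
}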
...@@ -6,6 +6,9 @@
typedef void *infinirtStream_t;
typedef void *infinirtEvent_t;
typedef void *infinirtGraph_t;
typedef void *infinirtGraphNode_t;
typedef void *infinirtGraphExec_t;
__C __export infiniStatus_t infinirtInit();
...@@ -63,4 +66,24 @@ __C __export infiniStatus_t infinirtMemcpyAsync(void *dst, const void *src, size
__C __export infiniStatus_t infinirtMallocAsync(void **p_ptr, size_t size, infinirtStream_t stream);
__C __export infiniStatus_t infinirtFreeAsync(void *ptr, infinirtStream_t stream);
// Graph
typedef enum {
INFINIRT_STREAM_CAPTURE_MODE_GLOBAL = 0,
INFINIRT_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1,
INFINIRT_STREAM_CAPTURE_MODE_RELAXED = 2,
} infinirtStreamCaptureMode_t;
__C __export infiniStatus_t infinirtStreamBeginCapture(infinirtStream_t stream, infinirtStreamCaptureMode_t mode);
__C __export infiniStatus_t infinirtStreamEndCapture(infinirtStream_t stream, infinirtGraph_t *graph_ptr);
__C __export infiniStatus_t infinirtGraphDestroy(infinirtGraph_t graph);
__C __export infiniStatus_t infinirtGraphInstantiate(
infinirtGraphExec_t *graph_exec_ptr,
infinirtGraph_t graph,
infinirtGraphNode_t *node_ptr,
char *log_buffer,
size_t buffer_size);
__C __export infiniStatus_t infinirtGraphExecDestroy(infinirtGraphExec_t graph_exec);
__C __export infiniStatus_t infinirtGraphLuanch(infinirtGraphExec_t graph_exec, infinirtStream_t stream);
#endif // __INFINIRT_API_H__
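
A hedged sketch of how the capture API above is presumably meant to be used; stream is assumed to be a valid infinirtStream_t, status checks are omitted for brevity, and the launch function is spelled exactly as declared above.

infinirtGraph_t graph = nullptr;
infinirtGraphExec_t graph_exec = nullptr;

infinirtStreamBeginCapture(stream, INFINIRT_STREAM_CAPTURE_MODE_GLOBAL);
// ... enqueue the kernels / async copies that should be replayed on `stream` ...
infinirtStreamEndCapture(stream, &graph);

char log_buffer[256] = {0};
infinirtGraphInstantiate(&graph_exec, graph, /*node_ptr=*/nullptr,
                         log_buffer, sizeof(log_buffer));

infinirtGraphLuanch(graph_exec, stream); // replays the captured work (name as declared above)

infinirtGraphExecDestroy(graph_exec);
infinirtGraphDestroy(graph);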
...@@ -8,7 +8,10 @@ from infinicore.context import (
get_device,
get_device_count,
get_stream,
is_graph_recording,
set_device,
start_graph_recording,
stop_graph_recording,
sync_device,
sync_stream,
)
...@@ -40,10 +43,15 @@ from infinicore.dtype import (
uint8,
)
from infinicore.ops.add import add
from infinicore.ops.add_rms_norm import add_rms_norm
from infinicore.ops.attention import attention
from infinicore.ops.kv_caching import kv_caching
from infinicore.ops.matmul import matmul
from infinicore.ops.mul import mul
from infinicore.ops.narrow import narrow
from infinicore.ops.paged_attention import paged_attention
from infinicore.ops.paged_attention_prefill import paged_attention_prefill
from infinicore.ops.paged_caching import paged_caching
from infinicore.ops.rearrange import rearrange
from infinicore.ops.squeeze import squeeze
from infinicore.ops.unsqueeze import unsqueeze
...@@ -77,6 +85,9 @@ __all__ = [
"set_device",
"sync_device",
"sync_stream",
"is_graph_recording",
"start_graph_recording",
"stop_graph_recording",
# Data Types.
"bfloat16",
"bool",
...@@ -102,7 +113,10 @@ __all__ = [
"uint8",
# Operations.
"add",
"add_rms_norm",
"add_rms_norm_",
"attention",
"kv_caching",
"matmul",
"mul",
"narrow",
...@@ -115,6 +129,9 @@ __all__ = [
"from_list",
"from_numpy",
"from_torch",
"paged_caching",
"paged_attention",
"paged_attention_prefill",
"ones",
"strided_empty",
"strided_from_blob",
......
import infinicore.device
from infinicore.graph import Graph
from infinicore.lib import _infinicore
...@@ -49,3 +50,24 @@ def get_stream():
stream: The current stream object
"""
return _infinicore.get_stream()
def is_graph_recording():
"""Check if the current graph is recording.
Returns:
bool: True if the current graph is recording, False otherwise
"""
return _infinicore.is_graph_recording()
def start_graph_recording(device=None):
"""Start recording the current graph."""
if device is not None:
set_device(device)
_infinicore.start_graph_recording()
def stop_graph_recording():
"""Stop recording the current graph."""
return Graph(_infinicore.stop_graph_recording())
...@@ -34,7 +34,10 @@ class device:
def __getattr__(self, name):
    # Lazily construct and cache an attribute, e.g. self._underlying.
    if name == "_underlying":
        setattr(self, name, device._to_infinicore_device(self.type, self.index))
    else:
        raise AttributeError("{!r} object has no attribute {!r}".format(self, name))
    return getattr(self, name)

def __repr__(self):
...@@ -79,6 +82,7 @@ _TORCH_DEVICE_MAP = {
_infinicore.Device.Type.KUNLUN: "cuda",
_infinicore.Device.Type.HYGON: "cuda",
_infinicore.Device.Type.QY: "cuda",
_infinicore.Device.Type.ALI: "cuda",
}
......
from infinicore.lib import _infinicore
class Graph:
"""
Python wrapper around an InfiniCore Graph instance.
"""
def __init__(self, graph: _infinicore.Graph):
if not isinstance(graph, _infinicore.Graph):
raise TypeError("Expected _infinicore.Graph")
self._graph = graph
def run(self):
return self._graph.run()
def __repr__(self):
return f"<Graph wrapper of {self._graph!r}>"
from .causal_softmax import causal_softmax
from .embedding import embedding
from .flash_attention import flash_attention
from .linear import linear
from .linear_w8a8i8 import linear_w8a8i8
from .random_sample import random_sample
from .rms_norm import rms_norm
from .rope import RopeAlgo, rope
from .silu import silu
from .silu_and_mul import silu_and_mul
from .swiglu import swiglu
__all__ = [
"causal_softmax",
"embedding",
"flash_attention",
"linear",
"random_sample",
"rms_norm",
"RopeAlgo",
"rope",
"silu",
"swiglu",
"linear_w8a8i8",
"silu_and_mul",
]
...@@ -22,9 +22,8 @@ def embedding(
and (sparse is False)
), "Unsupported parameters."
# Note: embedding now supports device-side input for graph recording;
# the C++ implementation handles both CPU and device-side inputs.
if out is None:
return Tensor(_infinicore.embedding(input._underlying, weight._underlying))
......
import math
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def flash_attention(
query,
key,
value,
total_kv_len,
attn_mask=None,
dropout_p=0,
is_causal=False,
scale=None,
enable_gqa=False,
):
assert attn_mask is None and dropout_p == 0 and not enable_gqa
emb_dim = query.shape[-1]
if scale is None:
scale = 1 / math.sqrt(emb_dim)
return Tensor(
_infinicore.flash_attention(
query._underlying,
key._underlying,
value._underlying,
total_kv_len._underlying,
scale,
is_causal,
)
)