Unverified Commit 8d09630a authored by gongchensu, committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
...@@ -133,6 +133,19 @@ public:
void debug() const;
/**
* Unsafe API that returns a new tensor sharing the same raw memory, which becomes untracked by the allocator.
* It is used to loosely track a piece of memory while still allowing it to be reused,
* typically in a compute-graph scenario (see the usage sketch after this header).
*/
Tensor to_blob_() const;
/**
* Unsafe API that returns a new tensor with the same memory and lets the allocator track that memory again.
* Should only be called on a tensor returned by to_blob_().
*/
Tensor resume_from_blob_() const;
///
/// Data Transfer APIs
///
...@@ -294,9 +307,13 @@ protected:
friend class Tensor;
protected:
TensorMetaData meta_;
TensorData data_;
private:
// Marks whether this tensor was created by to_blob_()
bool to_blob_mark_ = false;
};
} // namespace infinicore
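
A minimal usage sketch of the two new methods, assuming an existing infinicore::Tensor named activation created elsewhere; everything except to_blob_() and resume_from_blob_() is a placeholder, and real code should keep the lifetime caveats from the comments above in mind.

// Sketch only: `activation` is assumed to be a live infinicore::Tensor.
// Detach the raw memory from allocator tracking so it can be reused, e.g. while
// a compute graph holds on to the buffer across replays.
infinicore::Tensor blob = activation.to_blob_();

// ... build / replay compute-graph work that reuses the same raw memory ...

// Hand the memory back to the allocator. Per the comment above, this is only
// valid on a tensor that was produced by to_blob_().
infinicore::Tensor tracked = blob.resume_from_blob_();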
...@@ -3,18 +3,27 @@
#include "infiniop/handle.h"
#include "infiniop/ops/add.h"
#include "infiniop/ops/add_rms_norm.h"
#include "infiniop/ops/attention.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/dequantize_awq.h"
#include "infiniop/ops/embedding.h"
#include "infiniop/ops/flash_attention.h"
#include "infiniop/ops/gelu.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/int8_gemm.h"
#include "infiniop/ops/kv_caching.h"
#include "infiniop/ops/layer_norm.h"
#include "infiniop/ops/logsoftmax.h"
#include "infiniop/ops/lp_norm.h"
#include "infiniop/ops/mul.h"
#include "infiniop/ops/ones.h"
#include "infiniop/ops/paged_attention.h"
#include "infiniop/ops/paged_attention_prefill.h"
#include "infiniop/ops/paged_caching.h"
#include "infiniop/ops/quant/per_channel_quant_int8.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
#include "infiniop/ops/relu.h"
...@@ -22,6 +31,7 @@
#include "infiniop/ops/rope.h"
#include "infiniop/ops/sigmoid.h"
#include "infiniop/ops/silu.h"
#include "infiniop/ops/silu_and_mul.h"
#include "infiniop/ops/softmax.h"
#include "infiniop/ops/softplus.h"
#include "infiniop/ops/sub.h"
......
#ifndef __INFINIOP_ADD_RMS_NORM_API_H__
#define __INFINIOP_ADD_RMS_NORM_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopAddRMSNormDescriptor_t;
__C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor(
infiniopHandle_t handle,
infiniopAddRMSNormDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t residual_out_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
infiniopTensorDescriptor_t weight_desc,
float epsilon);
__C __export infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(infiniopAddRMSNormDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopAddRMSNorm(infiniopAddRMSNormDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
void *residual_out,
const void *a,
const void *b,
const void *weight,
void *stream);
__C __export infiniStatus_t infiniopDestroyAddRMSNormDescriptor(infiniopAddRMSNormDescriptor_t desc);
#endif
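
The operator headers in this change all follow the same descriptor / workspace / execute / destroy lifecycle. A hedged sketch of that flow for AddRMSNorm, assuming a valid infiniopHandle_t, pre-built tensor descriptors, device buffers, and a stream (all placeholder names); real code should also check each returned infiniStatus_t.

infiniopAddRMSNormDescriptor_t desc = nullptr;
// y_desc, residual_out_desc, a_desc, b_desc and w_desc are assumed to describe
// tensors of matching shape; 1e-6f is a typical RMSNorm epsilon.
infiniopCreateAddRMSNormDescriptor(handle, &desc, y_desc, residual_out_desc,
                                   a_desc, b_desc, w_desc, 1e-6f);

size_t workspace_size = 0;
infiniopGetAddRMSNormWorkspaceSize(desc, &workspace_size);
// `workspace` is assumed to be a device allocation of at least workspace_size bytes.

infiniopAddRMSNorm(desc, workspace, workspace_size,
                   y, residual_out, a, b, w, stream);

infiniopDestroyAddRMSNormDescriptor(desc);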
#ifndef __INFINIOP_EMBEDDING_API_H__
#define __INFINIOP_EMBEDDING_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopEmbeddingDescriptor_t;
__C __export infiniStatus_t infiniopCreateEmbeddingDescriptor(
infiniopHandle_t handle,
infiniopEmbeddingDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc);
__C __export infiniStatus_t infiniopEmbedding(
infiniopEmbeddingDescriptor_t desc,
void *output,
const void *input,
const void *weight,
void *stream);
__C __export infiniStatus_t infiniopDestroyEmbeddingDescriptor(
infiniopEmbeddingDescriptor_t desc);
#endif
#ifndef __INFINIOP_FLASH_ATTENTION_API_H__
#define __INFINIOP_FLASH_ATTENTION_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopFlashAttentionDescriptor_t;
__C __export infiniStatus_t infiniopCreateFlashAttentionDescriptor(
infiniopHandle_t handle,
infiniopFlashAttentionDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t q_desc,
infiniopTensorDescriptor_t k_desc,
infiniopTensorDescriptor_t v_desc,
infiniopTensorDescriptor_t total_kv_len,
float scale,
char is_causal);
__C __export infiniStatus_t infiniopGetFlashAttentionWorkspaceSize(
infiniopFlashAttentionDescriptor_t desc,
size_t *size);
__C __export infiniStatus_t infiniopFlashAttention(
infiniopFlashAttentionDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *q,
const void *k,
const void *v,
const void *total_kv_len,
void *stream);
__C __export infiniStatus_t infiniopDestroyFlashAttentionDescriptor(
infiniopFlashAttentionDescriptor_t desc);
#endif
#ifndef __INFINIOP_I8GEMM_API_H__
#define __INFINIOP_I8GEMM_API_H__
#include "../operator_descriptor.h"
typedef InfiniopDescriptor *infiniopI8GemmDescriptor_t;
__C __export infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
infiniopI8GemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t bias_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t weights_desc,
infiniopTensorDescriptor_t weights_scale_desc);
__C __export infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *bias,
const void *x,
const void *x_scale,
const void *weights,
const void *weights_scale,
void *stream);
__C __export infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t desc);
#endif
#ifndef __INFINIOP_KV_CACHING_API_H__
#define __INFINIOP_KV_CACHING_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopKVCachingDescriptor_t;
__C __export infiniStatus_t infiniopCreateKVCachingDescriptor(
infiniopHandle_t handle,
infiniopKVCachingDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t k_cache,
infiniopTensorDescriptor_t v_cache,
infiniopTensorDescriptor_t k,
infiniopTensorDescriptor_t v,
infiniopTensorDescriptor_t past_kv_lengths);
__C __export infiniStatus_t infiniopGetKVCachingWorkspaceSize(infiniopKVCachingDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopKVCaching(infiniopKVCachingDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *k_cache,
void *v_cache,
const void *k,
const void *v,
const void *past_kv_lengths,
void *stream);
__C __export infiniStatus_t infiniopDestroyKVCachingDescriptor(infiniopKVCachingDescriptor_t desc);
#endif
#ifndef __INFINIOP_PAGED_ATTENTION_API_H__
#define __INFINIOP_PAGED_ATTENTION_API_H__
#include "../operator_descriptor.h"
// Define an opaque handle for the Paged Attention descriptor.
typedef struct InfiniopDescriptor *infiniopPagedAttentionDescriptor_t;
/**
* @brief Creates a descriptor for the Paged Attention v1 operation.
*
* @param handle The library context handle.
* @param desc_ptr Pointer to the created descriptor.
* @param out_desc [Output] Shape: (num_seqs, num_heads, head_size).
* The output tensor for the attention mechanism.
* @param q_desc [Input] Shape: (num_seqs, num_heads, head_size).
* The query tensor.
* @param k_cache_desc [Input] Shape: (num_blocks, num_kv_heads, block_size, head_size).
* Paged key cache storing keys for all sequences.
* @param v_cache_desc [Input] Shape: (num_blocks, num_kv_heads, block_size, head_size).
* Paged value cache storing values for all sequences.
* @param block_tables_desc [Input] Shape: (num_seqs, max_num_blocks_per_seq).
* Maps each sequence to its physical block indices in the cache.
* Expected DType: int64_t (I64).
* @param seq_lens_desc [Input] Shape: (num_seqs,).
* The current logical length of each sequence.
* Expected DType: int64_t (I64).
* @param alibi_slopes_desc [Optional] Shape: (num_heads,).
* Slopes for ALiBi (Attention with Linear Biases). Can be NULL.
* @param scale The attention scaling factor (typically 1/sqrt(head_size)).
* @return infiniStatus_t Status code.
*/
__C __export infiniStatus_t infiniopCreatePagedAttentionDescriptor(
infiniopHandle_t handle,
infiniopPagedAttentionDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t q_desc,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
infiniopTensorDescriptor_t block_tables_desc,
infiniopTensorDescriptor_t seq_lens_desc,
infiniopTensorDescriptor_t alibi_slopes_desc,
float scale);
/**
* @brief Retrieves the workspace size required for the Paged Attention operation.
*
* @param desc The Paged Attention descriptor.
* @param size A pointer to store the required workspace size in bytes.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopGetPagedAttentionWorkspaceSize(
infiniopPagedAttentionDescriptor_t desc, size_t *size);
/**
* @brief Executes the Paged Attention v1 operation.
*
* @param desc The Paged Attention descriptor.
* @param workspace Pointer to the workspace memory.
* @param workspace_size The size of the workspace.
* @param out Pointer to the output tensor data.
* @param q Pointer to the query tensor data.
* @param k_cache Pointer to the key cache data.
* @param v_cache Pointer to the value cache data.
* @param block_tables Pointer to the block tables data.
* @param seq_lens Pointer to the sequence lengths data.
* @param alibi_slopes Pointer to the ALiBi slopes data. Can be NULL.
* @param stream The CUDA stream for the operation. Can be NULL.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopPagedAttention(
infiniopPagedAttentionDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *q,
const void *k_cache,
const void *v_cache,
const void *block_tables,
const void *seq_lens,
const void *alibi_slopes,
void *stream);
/**
* @brief Destroys a Paged Attention descriptor.
*
* @param desc The descriptor to be destroyed.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopDestroyPagedAttentionDescriptor(
infiniopPagedAttentionDescriptor_t desc);
#endif // __INFINIOP_PAGED_ATTENTION_API_H__
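
A hedged decode-step sketch based on the shapes documented above; handle, descriptor, and buffer setup are assumed to have happened elsewhere, and all identifiers other than the API calls are placeholders.

// q/out: (num_seqs, num_heads, head_size); caches: (num_blocks, num_kv_heads, block_size, head_size);
// block_tables: (num_seqs, max_num_blocks_per_seq) int64; seq_lens: (num_seqs,) int64.
const float scale = 0.125f; // 1/sqrt(head_size) for head_size == 64

infiniopPagedAttentionDescriptor_t desc = nullptr;
infiniopCreatePagedAttentionDescriptor(handle, &desc, out_desc, q_desc,
                                       k_cache_desc, v_cache_desc,
                                       block_tables_desc, seq_lens_desc,
                                       /*alibi_slopes_desc=*/nullptr, scale);

size_t workspace_size = 0;
infiniopGetPagedAttentionWorkspaceSize(desc, &workspace_size);
infiniopPagedAttention(desc, workspace, workspace_size, out, q, k_cache, v_cache,
                       block_tables, seq_lens, /*alibi_slopes=*/nullptr, stream);
infiniopDestroyPagedAttentionDescriptor(desc);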
#ifndef __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
#define __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
#include "../operator_descriptor.h"
// Define an opaque handle for the Paged Attention Prefill descriptor.
typedef struct InfiniopDescriptor *infiniopPagedAttentionPrefillDescriptor_t;
/**
* @brief Creates a descriptor for the Paged Attention Prefill operation.
* @param handle The handle to the InfiniOP library context.
* @param desc_ptr A pointer to store the created descriptor.
* @param out_desc Descriptor for the output tensor.
* Shape: [total_q_tokens, num_heads, head_size]
* @param q_desc Descriptor for the query tensor (packed/flattened).
* Shape: [total_q_tokens, num_heads, head_size]
* @param k_cache_desc Descriptor for the global physical key cache.
* Shape: [max_num_blocks, num_kv_heads, block_size, head_size]
* @param v_cache_desc Descriptor for the global physical value cache.
* Shape: [max_num_blocks, num_kv_heads, block_size, head_size]
* @param block_tables_desc Descriptor for the block tables mapping logical blocks to physical blocks.
* Shape: [batch_size, max_blocks_per_seq]
* @param seq_lens_desc Descriptor for the total KV lengths of each sequence.
* Shape: [batch_size]
* @param cum_seq_lens_q_desc Descriptor for the cumulative start position (prefix sum) of each Q sequence.
* Shape: [batch_size + 1]
* @param alibi_slopes_desc Optional descriptor for the ALiBi slopes tensor. Can be NULL.
* Shape: [num_heads]
* @param scale The attention scaling factor (typically 1.0 / sqrt(head_size)).
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopCreatePagedAttentionPrefillDescriptor(
infiniopHandle_t handle,
infiniopPagedAttentionPrefillDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t q_desc,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
infiniopTensorDescriptor_t block_tables_desc,
infiniopTensorDescriptor_t seq_lens_desc,
infiniopTensorDescriptor_t cum_seq_lens_q_desc,
infiniopTensorDescriptor_t alibi_slopes_desc,
float scale);
/**
* @brief Retrieves the workspace size required for the Paged Attention Prefill operation.
*/
__C __export infiniStatus_t infiniopGetPagedAttentionPrefillWorkspaceSize(
infiniopPagedAttentionPrefillDescriptor_t desc, size_t *size);
/**
* @brief Executes the Paged Attention Prefill operation.
* @param desc The Paged Attention Prefill descriptor.
* @param workspace Pointer to the workspace memory.
* @param workspace_size The size of the workspace.
* @param out Pointer to the output tensor data.
* @param q Pointer to the query tensor data (packed).
* @param k_cache Pointer to the global key cache data.
* @param v_cache Pointer to the global value cache data.
* @param block_tables Pointer to the block tables data.
* @param seq_lens Pointer to the KV lengths data.
* @param cum_seq_lens_q Pointer to the Q cumulative sequence lengths data (prefix sum).
* @param alibi_slopes Pointer to the ALiBi slopes data. Can be NULL.
* @param stream The device stream (e.g., cudaStream_t) for the operation.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopPagedAttentionPrefill(
infiniopPagedAttentionPrefillDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *q,
const void *k_cache,
const void *v_cache,
const void *block_tables,
const void *seq_lens,
const void *cum_seq_lens_q,
const void *alibi_slopes,
void *stream);
/**
* @brief Destroys a Paged Attention Prefill descriptor.
*/
__C __export infiniStatus_t infiniopDestroyPagedAttentionPrefillDescriptor(
infiniopPagedAttentionPrefillDescriptor_t desc);
#endif // __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
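
The main difference from the decode path above is the packed query layout. Below is a small illustrative sketch of how cum_seq_lens_q relates to the packed q/out tensors; it is host-side helper code, not part of the API, and int64 is assumed to match seq_lens since the header does not state the dtype.

#include <cstdint>
#include <vector>

// Illustrative only: build the prefix sum for a batch of query lengths.
std::vector<int64_t> build_cum_seq_lens_q(const std::vector<int64_t> &q_lens) {
    std::vector<int64_t> cum(q_lens.size() + 1, 0);
    for (size_t i = 0; i < q_lens.size(); ++i) {
        cum[i + 1] = cum[i] + q_lens[i];
    }
    return cum; // for q_lens {5, 2, 9} this is {0, 5, 7, 16}
}
// total_q_tokens == cum_seq_lens_q.back(); q and out are packed as
// [total_q_tokens, num_heads, head_size], and sequence i occupies rows
// [cum_seq_lens_q[i], cum_seq_lens_q[i+1]).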
#ifndef __INFINIOP_PAGED_CACHING_API_H__
#define __INFINIOP_PAGED_CACHING_API_H__
#include "../operator_descriptor.h"
// Define an opaque handle for the Paged Caching descriptor.
typedef struct InfiniopDescriptor *infiniopPagedCachingDescriptor_t;
/**
* @brief Creates a descriptor for the Paged Caching operation.
*
* This function initializes a descriptor that holds all the metadata needed
* to copy key/value vectors into their respective cache pools.
*
* @param handle The handle to the InfiniOP library context.
* @param desc_ptr A pointer to store the created descriptor.
* @param k_cache_desc Descriptor for the key cache pool tensor.
* @param v_cache_desc Descriptor for the value cache pool tensor.
* @param k_desc Descriptor for the source key tensor.
* @param v_desc Descriptor for the source value tensor.
* @param slot_mapping_desc Descriptor for the slot mapping tensor.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopCreatePagedCachingDescriptor(
infiniopHandle_t handle,
infiniopPagedCachingDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
infiniopTensorDescriptor_t k_desc,
infiniopTensorDescriptor_t v_desc,
infiniopTensorDescriptor_t slot_mapping_desc);
/**
* @brief Retrieves the workspace size required for the Paged Caching operation.
*
* @param desc The Paged Caching descriptor.
* @param size A pointer to store the required workspace size in bytes (typically 0).
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopGetPagedCachingWorkspaceSize(
infiniopPagedCachingDescriptor_t desc, size_t *size);
/**
* @brief Executes the Paged Caching operation.
*
* @param desc The Paged Caching descriptor.
* @param workspace Pointer to the workspace memory.
* @param workspace_size The size of the workspace.
* @param k_cache Pointer to the key cache pool data.
* @param v_cache Pointer to the value cache pool data.
* @param k Pointer to the source key tensor data.
* @param v Pointer to the source value tensor data.
* @param slot_mapping Pointer to the slot mapping data.
* @param stream The CUDA stream for the operation. Can be NULL.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopPagedCaching(
infiniopPagedCachingDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *k_cache,
void *v_cache,
const void *k,
const void *v,
const void *slot_mapping,
void *stream);
/**
* @brief Destroys a Paged Caching descriptor.
*
* @param desc The descriptor to be destroyed.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopDestroyPagedCachingDescriptor(
infiniopPagedCachingDescriptor_t desc);
#endif // __INFINIOP_PAGED_CACHING_API_H__
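
The header does not spell out the slot_mapping semantics. The sketch below assumes the common paged-KV convention of one flat destination slot per incoming token (slot = physical_block * block_size + offset within the block); this is an assumption rather than a documented contract.

#include <cstdint>
#include <vector>

// Hedged sketch: derive per-token cache slots for `num_new_tokens` tokens of one
// sequence from its block table, under the assumed flat-slot convention.
std::vector<int64_t> build_slot_mapping(const std::vector<int64_t> &block_table,
                                        int64_t block_size,
                                        int64_t first_token_pos,
                                        int64_t num_new_tokens) {
    std::vector<int64_t> slots;
    slots.reserve(static_cast<size_t>(num_new_tokens));
    for (int64_t i = 0; i < num_new_tokens; ++i) {
        const int64_t pos = first_token_pos + i;             // logical token position in the sequence
        const int64_t block = block_table[pos / block_size]; // physical block index from the block table
        slots.push_back(block * block_size + pos % block_size);
    }
    return slots;
}

The resulting array is what infiniopPagedCaching would consume through slot_mapping alongside the k/v tensors being scattered into the cache pools.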
#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#include "../../operator_descriptor.h"
typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;
__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);
__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
void *stream);
__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc);
#endif
#ifndef __INFINIOP_SILU_AND_MUL_API_H__
#define __INFINIOP_SILU_AND_MUL_API_H__
#include "../operator_descriptor.h"
/**
* @brief Opaque handle for the SiluAndMul descriptor.
*/
typedef struct InfiniopDescriptor *infiniopSiluAndMulDescriptor_t;
/**
* @brief Creates a descriptor for the SiLU and Multiply (SiluAndMul) operation.
*
* Format: (input_shape, output_shape)
* Referencing the vLLM SiluAndMul kernel interface:
* - input_shape is [..., 2*d] (the last dimension is split into two halves: one for SiLU, one for multiplication)
* - output_shape is [..., d] (the last dimension is halved)
*
* @param handle The handle to the InfiniOP library context.
* @param desc_ptr A pointer to store the created descriptor.
* @param output Descriptor for the output tensor. Shape [..., d].
* @param input Descriptor for the input tensor. Shape [..., 2*d].
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopCreateSiluAndMulDescriptor(
infiniopHandle_t handle,
infiniopSiluAndMulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output,
infiniopTensorDescriptor_t input);
/**
* @brief Queries the workspace size required for SiluAndMul computation.
* @param desc The SiluAndMul descriptor.
* @param size Pointer to store the required workspace size in bytes.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(
infiniopSiluAndMulDescriptor_t desc,
size_t *size);
/**
* @brief Executes the SiluAndMul operation.
*
* Performs SiLU activation on the first half of the last dimension of `input`,
* multiplies element-wise with the second half, and stores the result in `output`.
*
* @param desc The SiluAndMul descriptor.
* @param workspace Pointer to workspace memory allocated according to GetWorkspaceSize().
* @param workspace_size Size of the workspace in bytes.
* @param output Pointer to the output tensor memory. Shape [..., d].
* @param input Pointer to the input tensor memory. Shape [..., 2*d].
* @param stream Pointer to the execution stream (e.g., CUDA stream). Can be NULL for default stream.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopSiluAndMul(
infiniopSiluAndMulDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream);
/**
* @brief Destroys a previously created SiluAndMul descriptor.
* @param desc The descriptor to destroy.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopDestroySiluAndMulDescriptor(
infiniopSiluAndMulDescriptor_t desc);
#endif // __INFINIOP_SILU_AND_MUL_API_H__
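
To make the split-and-multiply semantics above concrete, here is a scalar reference for a single row, assuming float data (the library kernel operates on whatever dtypes its descriptors allow); this is a sketch of the math, not the implementation.

#include <cmath>
#include <cstddef>

// out[j] = silu(in[j]) * in[j + d], with silu(x) = x / (1 + exp(-x)).
// `in` holds 2*d contiguous elements of one row, `out` holds d elements.
void silu_and_mul_row(const float *in, float *out, std::size_t d) {
    for (std::size_t j = 0; j < d; ++j) {
        const float gate = in[j];   // first half: goes through SiLU
        const float up = in[j + d]; // second half: elementwise multiplier
        out[j] = (gate / (1.0f + std::exp(-gate))) * up;
    }
}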
...@@ -6,6 +6,9 @@
typedef void *infinirtStream_t;
typedef void *infinirtEvent_t;
typedef void *infinirtGraph_t;
typedef void *infinirtGraphNode_t;
typedef void *infinirtGraphExec_t;
__C __export infiniStatus_t infinirtInit();
...@@ -63,4 +66,24 @@ __C __export infiniStatus_t infinirtMemcpyAsync(void *dst, const void *src, size
__C __export infiniStatus_t infinirtMallocAsync(void **p_ptr, size_t size, infinirtStream_t stream);
__C __export infiniStatus_t infinirtFreeAsync(void *ptr, infinirtStream_t stream);
// Graph
typedef enum {
INFINIRT_STREAM_CAPTURE_MODE_GLOBAL = 0,
INFINIRT_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1,
INFINIRT_STREAM_CAPTURE_MODE_RELAXED = 2,
} infinirtStreamCaptureMode_t;
__C __export infiniStatus_t infinirtStreamBeginCapture(infinirtStream_t stream, infinirtStreamCaptureMode_t mode);
__C __export infiniStatus_t infinirtStreamEndCapture(infinirtStream_t stream, infinirtGraph_t *graph_ptr);
__C __export infiniStatus_t infinirtGraphDestroy(infinirtGraph_t graph);
__C __export infiniStatus_t infinirtGraphInstantiate(
infinirtGraphExec_t *graph_exec_ptr,
infinirtGraph_t graph,
infinirtGraphNode_t *node_ptr,
char *log_buffer,
size_t buffer_size);
__C __export infiniStatus_t infinirtGraphExecDestroy(infinirtGraphExec_t graph_exec);
__C __export infiniStatus_t infinirtGraphLuanch(infinirtGraphExec_t graph_exec, infinirtStream_t stream);
#endif // __INFINIRT_API_H__
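
A hedged sketch of how the capture API above is presumably meant to be used; stream is assumed to be a valid infinirtStream_t, status checks are omitted for brevity, and the launch function is spelled exactly as declared above.

infinirtGraph_t graph = nullptr;
infinirtGraphExec_t graph_exec = nullptr;

infinirtStreamBeginCapture(stream, INFINIRT_STREAM_CAPTURE_MODE_GLOBAL);
// ... enqueue the kernels / async copies that should be replayed on `stream` ...
infinirtStreamEndCapture(stream, &graph);

char log_buffer[256] = {0};
infinirtGraphInstantiate(&graph_exec, graph, /*node_ptr=*/nullptr,
                         log_buffer, sizeof(log_buffer));

infinirtGraphLuanch(graph_exec, stream); // replays the captured work (name as declared above)

infinirtGraphExecDestroy(graph_exec);
infinirtGraphDestroy(graph);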
...@@ -8,7 +8,10 @@ from infinicore.context import (
get_device,
get_device_count,
get_stream,
is_graph_recording,
set_device,
start_graph_recording,
stop_graph_recording,
sync_device,
sync_stream,
)
...@@ -40,10 +43,15 @@ from infinicore.dtype import (
uint8,
)
from infinicore.ops.add import add
from infinicore.ops.add_rms_norm import add_rms_norm
from infinicore.ops.attention import attention
from infinicore.ops.kv_caching import kv_caching
from infinicore.ops.matmul import matmul
from infinicore.ops.mul import mul
from infinicore.ops.narrow import narrow
from infinicore.ops.paged_attention import paged_attention
from infinicore.ops.paged_attention_prefill import paged_attention_prefill
from infinicore.ops.paged_caching import paged_caching
from infinicore.ops.rearrange import rearrange
from infinicore.ops.squeeze import squeeze
from infinicore.ops.unsqueeze import unsqueeze
...@@ -77,6 +85,9 @@ __all__ = [
"set_device",
"sync_device",
"sync_stream",
"is_graph_recording",
"start_graph_recording",
"stop_graph_recording",
# Data Types.
"bfloat16",
"bool",
...@@ -102,7 +113,10 @@ __all__ = [
"uint8",
# Operations.
"add",
"add_rms_norm",
"add_rms_norm_",
"attention",
"kv_caching",
"matmul",
"mul",
"narrow",
...@@ -115,6 +129,9 @@ __all__ = [
"from_list",
"from_numpy",
"from_torch",
"paged_caching",
"paged_attention",
"paged_attention_prefill",
"ones",
"strided_empty",
"strided_from_blob",
......
import infinicore.device
from infinicore.graph import Graph
from infinicore.lib import _infinicore
...@@ -49,3 +50,24 @@ def get_stream():
stream: The current stream object
"""
return _infinicore.get_stream()
def is_graph_recording():
"""Check if the current graph is recording.
Returns:
bool: True if the current graph is recording, False otherwise
"""
return _infinicore.is_graph_recording()
def start_graph_recording(device=None):
"""Start recording the current graph."""
if device is not None:
set_device(device)
_infinicore.start_graph_recording()
def stop_graph_recording():
"""Stop recording the current graph."""
return Graph(_infinicore.stop_graph_recording())
...@@ -34,7 +34,10 @@ class device:
def __getattr__(self, name):
    # Lazily construct and cache an attribute, e.g. self._underlying.
    if name == "_underlying":
        setattr(self, name, device._to_infinicore_device(self.type, self.index))
    else:
        raise AttributeError("{!r} object has no attribute {!r}".format(self, name))
    return getattr(self, name)

def __repr__(self):
...@@ -79,6 +82,7 @@ _TORCH_DEVICE_MAP = {
_infinicore.Device.Type.KUNLUN: "cuda",
_infinicore.Device.Type.HYGON: "cuda",
_infinicore.Device.Type.QY: "cuda",
_infinicore.Device.Type.ALI: "cuda",
}
......
from infinicore.lib import _infinicore
class Graph:
"""
Python wrapper around an InfiniCore Graph instance.
"""
def __init__(self, graph: _infinicore.Graph):
if not isinstance(graph, _infinicore.Graph):
raise TypeError("Expected _infinicore.Graph")
self._graph = graph
def run(self):
return self._graph.run()
def __repr__(self):
return f"<Graph wrapper of {self._graph!r}>"
from .causal_softmax import causal_softmax
from .embedding import embedding
from .flash_attention import flash_attention
from .linear import linear
from .linear_w8a8i8 import linear_w8a8i8
from .random_sample import random_sample
from .rms_norm import rms_norm
from .rope import RopeAlgo, rope
from .silu import silu
from .silu_and_mul import silu_and_mul
from .swiglu import swiglu
__all__ = [
"causal_softmax",
"embedding",
"flash_attention",
"linear",
"random_sample",
"rms_norm",
"RopeAlgo",
"rope",
"silu",
"swiglu",
"linear_w8a8i8",
"silu_and_mul",
]
...@@ -22,9 +22,8 @@ def embedding(
and (sparse is False)
), "Unsupported parameters."
# Note: embedding now supports device-side input for graph recording;
# the C++ implementation handles both CPU and device-side inputs.
if out is None:
return Tensor(_infinicore.embedding(input._underlying, weight._underlying))
......
import math
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def flash_attention(
query,
key,
value,
total_kv_len,
attn_mask=None,
dropout_p=0,
is_causal=False,
scale=None,
enable_gqa=False,
):
assert attn_mask is None and dropout_p == 0 and not enable_gqa
emb_dim = query.shape[-1]
if scale is None:
scale = 1 / math.sqrt(emb_dim)
return Tensor(
_infinicore.flash_attention(
query._underlying,
key._underlying,
value._underlying,
total_kv_len._underlying,
scale,
is_causal,
)
)