jerrrrry / infinicore · Commits

Commit 8d09630a (Unverified)
Authored Feb 11, 2026 by gongchensu; committed by GitHub on Feb 11, 2026.

Merge branch 'demo131' into Issue/862

Parents: ab52dead, 012df56c
Changes: 387 in total; this page shows 20 changed files with 672 additions and 9 deletions (+672 -9).
include/infinicore/tensor.hpp (+18 -1)
include/infiniop.h (+10 -0)
include/infiniop/ops/add_rms_norm.h (+32 -0)
include/infiniop/ops/embedding.h (+26 -0)
include/infiniop/ops/flash_attention.h (+36 -0)
include/infiniop/ops/int8_gemm.h (+32 -0)
include/infiniop/ops/kv_caching.h (+31 -0)
include/infiniop/ops/paged_attention.h (+93 -0)
include/infiniop/ops/paged_attention_prefill.h (+87 -0)
include/infiniop/ops/paged_caching.h (+77 -0)
include/infiniop/ops/quant/per_channel_quant_int8.h (+28 -0)
include/infiniop/ops/silu_and_mul.h (+71 -0)
include/infinirt.h (+23 -0)
python/infinicore/__init__.py (+17 -0)
python/infinicore/context.py (+22 -0)
python/infinicore/device.py (+5 -1)
python/infinicore/graph.py (+18 -0)
python/infinicore/nn/functional/__init__.py (+10 -4)
python/infinicore/nn/functional/embedding.py (+2 -3)
python/infinicore/nn/functional/flash_attention.py (+34 -0)
include/infinicore/tensor.hpp

```diff
@@ -133,6 +133,19 @@ public:
     void debug() const;

+    /**
+     * Unsafe API that returns a new tensor over the same raw memory, untracked by the allocator.
+     * This is used to loosely track a piece of memory while allowing it to be reused,
+     * typically in a compute-graph scenario.
+     */
+    Tensor to_blob_() const;
+
+    /**
+     * Unsafe API that returns a new tensor over the same memory and lets the allocator
+     * re-track it. Should only be used on a tensor returned by to_blob_().
+     */
+    Tensor resume_from_blob_() const;
+
     ///
     /// Data Transfer APIs
     ///
@@ -294,9 +307,13 @@ protected:
     friend class Tensor;

-private:
+protected:
     TensorMetaData meta_;
     TensorData data_;
+
+private:
+    // Mark indicating whether the tensor was created by to_blob_()
+    bool to_blob_mark_ = false;
 };

 } // namespace infinicore
```
include/infiniop.h

```diff
@@ -3,18 +3,27 @@
 #include "infiniop/handle.h"
 #include "infiniop/ops/add.h"
+#include "infiniop/ops/add_rms_norm.h"
 #include "infiniop/ops/attention.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
 #include "infiniop/ops/dequantize_awq.h"
+#include "infiniop/ops/embedding.h"
+#include "infiniop/ops/flash_attention.h"
 #include "infiniop/ops/gelu.h"
 #include "infiniop/ops/gemm.h"
+#include "infiniop/ops/int8_gemm.h"
+#include "infiniop/ops/kv_caching.h"
 #include "infiniop/ops/layer_norm.h"
 #include "infiniop/ops/logsoftmax.h"
 #include "infiniop/ops/lp_norm.h"
 #include "infiniop/ops/mul.h"
 #include "infiniop/ops/ones.h"
+#include "infiniop/ops/paged_attention.h"
+#include "infiniop/ops/paged_attention_prefill.h"
+#include "infiniop/ops/paged_caching.h"
+#include "infiniop/ops/quant/per_channel_quant_int8.h"
 #include "infiniop/ops/random_sample.h"
 #include "infiniop/ops/rearrange.h"
 #include "infiniop/ops/relu.h"
@@ -22,6 +31,7 @@
 #include "infiniop/ops/rope.h"
 #include "infiniop/ops/sigmoid.h"
 #include "infiniop/ops/silu.h"
+#include "infiniop/ops/silu_and_mul.h"
 #include "infiniop/ops/softmax.h"
 #include "infiniop/ops/softplus.h"
 #include "infiniop/ops/sub.h"
```
include/infiniop/ops/add_rms_norm.h (new file)

```c
#ifndef __INFINIOP_ADD_RMS_NORM_API_H__
#define __INFINIOP_ADD_RMS_NORM_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopAddRMSNormDescriptor_t;

__C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor(
    infiniopHandle_t handle,
    infiniopAddRMSNormDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t residual_out_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc,
    infiniopTensorDescriptor_t weight_desc,
    float epsilon);

__C __export infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(
    infiniopAddRMSNormDescriptor_t desc,
    size_t *size);

__C __export infiniStatus_t infiniopAddRMSNorm(
    infiniopAddRMSNormDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    void *residual_out,
    const void *a,
    const void *b,
    const void *weight,
    void *stream);

__C __export infiniStatus_t infiniopDestroyAddRMSNormDescriptor(
    infiniopAddRMSNormDescriptor_t desc);

#endif
```
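Every operator introduced in this commit follows the same create → query-workspace → execute → destroy lifecycle. Below is a minimal sketch of that lifecycle for AddRMSNorm, assuming the handle, tensor descriptors, device buffers, and stream were created elsewhere. The helper name `run_add_rms_norm`, the epsilon value, the status constant `INFINI_STATUS_SUCCESS`, and the use of `infinirtMallocAsync`/`infinirtFreeAsync` (declared in infinirt.h further down) are assumptions for illustration, not shown in this diff:

```cpp
// Hedged sketch of the shared descriptor lifecycle, shown for AddRMSNorm.
// All handles, descriptors, and buffers are assumed to be valid.
infiniStatus_t run_add_rms_norm(
    infiniopHandle_t handle,
    infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t residual_out_desc,
    infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc,
    infiniopTensorDescriptor_t weight_desc,
    void *y, void *residual_out,
    const void *a, const void *b, const void *weight,
    infinirtStream_t stream) {
    // 1) Create the operator descriptor (the epsilon value is illustrative).
    infiniopAddRMSNormDescriptor_t desc;
    infiniStatus_t status = infiniopCreateAddRMSNormDescriptor(
        handle, &desc, y_desc, residual_out_desc, a_desc, b_desc, weight_desc,
        /*epsilon=*/1e-6f);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    // 2) Query and allocate the workspace.
    size_t workspace_size = 0;
    infiniopGetAddRMSNormWorkspaceSize(desc, &workspace_size);
    void *workspace = nullptr;
    if (workspace_size > 0) {
        infinirtMallocAsync(&workspace, workspace_size, stream);
    }

    // 3) Execute. Assumed semantics: residual_out = a + b; y = rms_norm(a + b, weight).
    status = infiniopAddRMSNorm(desc, workspace, workspace_size,
                                y, residual_out, a, b, weight, stream);

    // 4) Release the workspace and the descriptor.
    if (workspace != nullptr) {
        infinirtFreeAsync(workspace, stream);
    }
    infiniopDestroyAddRMSNormDescriptor(desc);
    return status;
}
```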
include/infiniop/ops/embedding.h (new file)

```c
#ifndef __INFINIOP_EMBEDDING_API_H__
#define __INFINIOP_EMBEDDING_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopEmbeddingDescriptor_t;

__C __export infiniStatus_t infiniopCreateEmbeddingDescriptor(
    infiniopHandle_t handle,
    infiniopEmbeddingDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t weight_desc);

__C __export infiniStatus_t infiniopEmbedding(
    infiniopEmbeddingDescriptor_t desc,
    void *output,
    const void *input,
    const void *weight,
    void *stream);

__C __export infiniStatus_t infiniopDestroyEmbeddingDescriptor(
    infiniopEmbeddingDescriptor_t desc);

#endif
```
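The header documents no shapes or dtypes, so as a hedged scalar reference for what an embedding lookup of this form presumably computes (a row gather from the weight matrix). `embedding_reference` is a hypothetical helper; the int64 index dtype and contiguous row-major layout are assumptions:

```cpp
// Hedged reference semantics: output[i, :] = weight[input[i], :].
// Index dtype (int64_t) and row-major layout are assumptions.
#include <cstddef>
#include <cstdint>

void embedding_reference(float *output, const int64_t *input, const float *weight,
                         size_t num_tokens, size_t hidden_size) {
    for (size_t i = 0; i < num_tokens; ++i) {
        const float *row = weight + static_cast<size_t>(input[i]) * hidden_size;
        for (size_t j = 0; j < hidden_size; ++j) {
            output[i * hidden_size + j] = row[j];
        }
    }
}
```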
include/infiniop/ops/flash_attention.h (new file)

```c
#ifndef __INFINIOP_FLASH_ATTENTION_API_H__
#define __INFINIOP_FLASH_ATTENTION_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopFlashAttentionDescriptor_t;

__C __export infiniStatus_t infiniopCreateFlashAttentionDescriptor(
    infiniopHandle_t handle,
    infiniopFlashAttentionDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t q_desc,
    infiniopTensorDescriptor_t k_desc,
    infiniopTensorDescriptor_t v_desc,
    infiniopTensorDescriptor_t total_kv_len,
    float scale,
    char is_causal);

__C __export infiniStatus_t infiniopGetFlashAttentionWorkspaceSize(
    infiniopFlashAttentionDescriptor_t desc,
    size_t *size);

__C __export infiniStatus_t infiniopFlashAttention(
    infiniopFlashAttentionDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *q,
    const void *k,
    const void *v,
    const void *total_kv_len,
    void *stream);

__C __export infiniStatus_t infiniopDestroyFlashAttentionDescriptor(
    infiniopFlashAttentionDescriptor_t desc);

#endif
```
include/infiniop/ops/int8_gemm.h (new file)

```c
#ifndef __INFINIOP_I8GEMM_API_H__
#define __INFINIOP_I8GEMM_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopI8GemmDescriptor_t;

__C __export infiniStatus_t infiniopCreateI8GemmDescriptor(
    infiniopHandle_t handle,
    infiniopI8GemmDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t bias_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t x_scale_desc,
    infiniopTensorDescriptor_t weights_desc,
    infiniopTensorDescriptor_t weights_scale_desc);

__C __export infiniStatus_t infiniopGetI8GemmWorkspaceSize(
    infiniopI8GemmDescriptor_t desc,
    size_t *size);

__C __export infiniStatus_t infiniopI8Gemm(
    infiniopI8GemmDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *bias,
    const void *x,
    const void *x_scale,
    const void *weights,
    const void *weights_scale,
    void *stream);

__C __export infiniStatus_t infiniopDestroyI8GemmDescriptor(
    infiniopI8GemmDescriptor_t desc);

#endif
```
include/infiniop/ops/kv_caching.h (new file)

```c
#ifndef __INFINIOP_KV_CACHING_API_H__
#define __INFINIOP_KV_CACHING_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopKVCachingDescriptor_t;

__C __export infiniStatus_t infiniopCreateKVCachingDescriptor(
    infiniopHandle_t handle,
    infiniopKVCachingDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t k_cache,
    infiniopTensorDescriptor_t v_cache,
    infiniopTensorDescriptor_t k,
    infiniopTensorDescriptor_t v,
    infiniopTensorDescriptor_t past_kv_lengths);

__C __export infiniStatus_t infiniopGetKVCachingWorkspaceSize(
    infiniopKVCachingDescriptor_t desc,
    size_t *size);

__C __export infiniStatus_t infiniopKVCaching(
    infiniopKVCachingDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *k_cache,
    void *v_cache,
    const void *k,
    const void *v,
    const void *past_kv_lengths,
    void *stream);

__C __export infiniStatus_t infiniopDestroyKVCachingDescriptor(
    infiniopKVCachingDescriptor_t desc);

#endif
```
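The header gives no shape contract; reading the parameter names, a plausible (assumed) semantic is that each sequence's new key/value entries are appended into contiguous per-sequence caches at the offset recorded in `past_kv_lengths`. A scalar sketch of that assumption, with hypothetical shapes:

```cpp
// Assumed reference semantics: append one new KV entry per sequence at the
// offset given by past_kv_lengths. Shapes and dtypes are guesses, not taken
// from this commit.
#include <cstddef>
#include <cstdint>

void kv_caching_reference(float *k_cache, float *v_cache,   // [num_seqs, max_len, kv_dim]
                          const float *k, const float *v,   // [num_seqs, kv_dim]
                          const int64_t *past_kv_lengths,   // [num_seqs]
                          size_t num_seqs, size_t max_len, size_t kv_dim) {
    for (size_t s = 0; s < num_seqs; ++s) {
        size_t dst = (s * max_len + static_cast<size_t>(past_kv_lengths[s])) * kv_dim;
        for (size_t d = 0; d < kv_dim; ++d) {
            k_cache[dst + d] = k[s * kv_dim + d];
            v_cache[dst + d] = v[s * kv_dim + d];
        }
    }
}
```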
include/infiniop/ops/paged_attention.h (new file)

```c
#ifndef __INFINIOP_PAGED_ATTENTION_API_H__
#define __INFINIOP_PAGED_ATTENTION_API_H__

#include "../operator_descriptor.h"

// Opaque handle for the Paged Attention descriptor.
typedef struct InfiniopDescriptor *infiniopPagedAttentionDescriptor_t;

/**
 * @brief Creates a descriptor for the Paged Attention v1 operation.
 *
 * @param handle            The library context handle.
 * @param desc_ptr          Pointer to the created descriptor.
 * @param out_desc          [Output] Shape: (num_seqs, num_heads, head_size).
 *                          The output tensor for the attention mechanism.
 * @param q_desc            [Input] Shape: (num_seqs, num_heads, head_size).
 *                          The query tensor.
 * @param k_cache_desc      [Input] Shape: (num_blocks, num_kv_heads, block_size, head_size).
 *                          Paged key cache storing keys for all sequences.
 * @param v_cache_desc      [Input] Shape: (num_blocks, num_kv_heads, block_size, head_size).
 *                          Paged value cache storing values for all sequences.
 * @param block_tables_desc [Input] Shape: (num_seqs, max_num_blocks_per_seq).
 *                          Maps each sequence to its physical block indices in the cache.
 *                          Expected dtype: int64_t (I64).
 * @param seq_lens_desc     [Input] Shape: (num_seqs,).
 *                          The current logical length of each sequence.
 *                          Expected dtype: int64_t (I64).
 * @param alibi_slopes_desc [Optional] Shape: (num_heads,).
 *                          Slopes for ALiBi (Attention with Linear Biases). Can be NULL.
 * @param scale             The attention scaling factor (typically 1/sqrt(head_size)).
 * @return infiniStatus_t Status code.
 */
__C __export infiniStatus_t infiniopCreatePagedAttentionDescriptor(
    infiniopHandle_t handle,
    infiniopPagedAttentionDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t q_desc,
    infiniopTensorDescriptor_t k_cache_desc,
    infiniopTensorDescriptor_t v_cache_desc,
    infiniopTensorDescriptor_t block_tables_desc,
    infiniopTensorDescriptor_t seq_lens_desc,
    infiniopTensorDescriptor_t alibi_slopes_desc,
    float scale);

/**
 * @brief Retrieves the workspace size required for the Paged Attention operation.
 *
 * @param desc The Paged Attention descriptor.
 * @param size A pointer to store the required workspace size in bytes.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopGetPagedAttentionWorkspaceSize(
    infiniopPagedAttentionDescriptor_t desc,
    size_t *size);

/**
 * @brief Executes the Paged Attention v1 operation.
 *
 * @param desc           The Paged Attention descriptor.
 * @param workspace      Pointer to the workspace memory.
 * @param workspace_size The size of the workspace.
 * @param out            Pointer to the output tensor data.
 * @param q              Pointer to the query tensor data.
 * @param k_cache        Pointer to the key cache data.
 * @param v_cache        Pointer to the value cache data.
 * @param block_tables   Pointer to the block tables data.
 * @param seq_lens       Pointer to the sequence lengths data.
 * @param alibi_slopes   Pointer to the ALiBi slopes data. Can be NULL.
 * @param stream         The CUDA stream for the operation. Can be NULL.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopPagedAttention(
    infiniopPagedAttentionDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *q,
    const void *k_cache,
    const void *v_cache,
    const void *block_tables,
    const void *seq_lens,
    const void *alibi_slopes,
    void *stream);

/**
 * @brief Destroys a Paged Attention descriptor.
 *
 * @param desc The descriptor to be destroyed.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopDestroyPagedAttentionDescriptor(
    infiniopPagedAttentionDescriptor_t desc);

#endif // __INFINIOP_PAGED_ATTENTION_API_H__
```
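A rough sketch of a single decode step over this API, assuming all descriptors, device buffers, and a pre-sized workspace were prepared as in the AddRMSNorm lifecycle sketch above. `paged_attention_decode_step` is a hypothetical helper; `INFINI_STATUS_SUCCESS` is again an assumed status name; ALiBi is disabled by passing NULL, as the doc comment permits:

```cpp
// Sketch: one paged-attention decode step over num_seqs sequences.
#include <cmath>

infiniStatus_t paged_attention_decode_step(
    infiniopHandle_t handle,
    infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t q_desc,
    infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc,
    infiniopTensorDescriptor_t block_tables_desc, infiniopTensorDescriptor_t seq_lens_desc,
    size_t head_size,
    void *out, const void *q, const void *k_cache, const void *v_cache,
    const void *block_tables, const void *seq_lens,
    void *workspace, size_t workspace_size, void *stream) {
    // Scale follows the 1/sqrt(head_size) convention documented above.
    const float scale = 1.0f / std::sqrt(static_cast<float>(head_size));

    infiniopPagedAttentionDescriptor_t desc;
    infiniStatus_t status = infiniopCreatePagedAttentionDescriptor(
        handle, &desc, out_desc, q_desc, k_cache_desc, v_cache_desc,
        block_tables_desc, seq_lens_desc, /*alibi_slopes_desc=*/nullptr, scale);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    status = infiniopPagedAttention(desc, workspace, workspace_size,
                                    out, q, k_cache, v_cache,
                                    block_tables, seq_lens,
                                    /*alibi_slopes=*/nullptr, stream);

    infiniopDestroyPagedAttentionDescriptor(desc);
    return status;
}
```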
include/infiniop/ops/paged_attention_prefill.h (new file)

```c
#ifndef __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
#define __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__

#include "../operator_descriptor.h"

// Opaque handle for the Paged Attention Prefill descriptor.
typedef struct InfiniopDescriptor *infiniopPagedAttentionPrefillDescriptor_t;

/**
 * @brief Creates a descriptor for the Paged Attention Prefill operation.
 *
 * @param handle              The handle to the InfiniOP library context.
 * @param desc_ptr            A pointer to store the created descriptor.
 * @param out_desc            Descriptor for the output tensor.
 *                            Shape: [total_q_tokens, num_heads, head_size]
 * @param q_desc              Descriptor for the query tensor (packed/flattened).
 *                            Shape: [total_q_tokens, num_heads, head_size]
 * @param k_cache_desc        Descriptor for the global physical key cache.
 *                            Shape: [max_num_blocks, num_kv_heads, block_size, head_size]
 * @param v_cache_desc        Descriptor for the global physical value cache.
 *                            Shape: [max_num_blocks, num_kv_heads, block_size, head_size]
 * @param block_tables_desc   Descriptor for the block tables mapping logical to physical blocks.
 *                            Shape: [batch_size, max_blocks_per_seq]
 * @param seq_lens_desc       Descriptor for the total KV length of each sequence.
 *                            Shape: [batch_size]
 * @param cum_seq_lens_q_desc Descriptor for the cumulative start positions (prefix sum) of the Q sequences.
 *                            Shape: [batch_size + 1]
 * @param alibi_slopes_desc   Optional descriptor for the ALiBi slopes tensor. Can be NULL.
 *                            Shape: [num_heads]
 * @param scale               The attention scaling factor (typically 1.0 / sqrt(head_size)).
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopCreatePagedAttentionPrefillDescriptor(
    infiniopHandle_t handle,
    infiniopPagedAttentionPrefillDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t q_desc,
    infiniopTensorDescriptor_t k_cache_desc,
    infiniopTensorDescriptor_t v_cache_desc,
    infiniopTensorDescriptor_t block_tables_desc,
    infiniopTensorDescriptor_t seq_lens_desc,
    infiniopTensorDescriptor_t cum_seq_lens_q_desc,
    infiniopTensorDescriptor_t alibi_slopes_desc,
    float scale);

/**
 * @brief Retrieves the workspace size required for the Paged Attention Prefill operation.
 */
__C __export infiniStatus_t infiniopGetPagedAttentionPrefillWorkspaceSize(
    infiniopPagedAttentionPrefillDescriptor_t desc,
    size_t *size);

/**
 * @brief Executes the Paged Attention Prefill operation.
 *
 * @param desc           The Paged Attention Prefill descriptor.
 * @param workspace      Pointer to the workspace memory.
 * @param workspace_size The size of the workspace.
 * @param out            Pointer to the output tensor data.
 * @param q              Pointer to the query tensor data (packed).
 * @param k_cache        Pointer to the global key cache data.
 * @param v_cache        Pointer to the global value cache data.
 * @param block_tables   Pointer to the block tables data.
 * @param seq_lens       Pointer to the KV lengths data.
 * @param cum_seq_lens_q Pointer to the Q cumulative sequence lengths data (prefix sum).
 * @param alibi_slopes   Pointer to the ALiBi slopes data. Can be NULL.
 * @param stream         The device stream (e.g., cudaStream_t) for the operation.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopPagedAttentionPrefill(
    infiniopPagedAttentionPrefillDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *q,
    const void *k_cache,
    const void *v_cache,
    const void *block_tables,
    const void *seq_lens,
    const void *cum_seq_lens_q,
    const void *alibi_slopes,
    void *stream);

/**
 * @brief Destroys a Paged Attention Prefill descriptor.
 */
__C __export infiniStatus_t infiniopDestroyPagedAttentionPrefillDescriptor(
    infiniopPagedAttentionPrefillDescriptor_t desc);

#endif // __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
```
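The one input here that is easy to get wrong is `cum_seq_lens_q`: per the doc comment it is a prefix sum over the per-sequence query lengths, with batch_size + 1 entries, so the last entry equals total_q_tokens. A small host-side sketch of building it; `build_cum_seq_lens_q` is a hypothetical helper, and the int64_t dtype mirrors the I64 expectation documented for the decode variant (an assumption here):

```cpp
// Build cum_seq_lens_q from per-sequence query lengths:
// cum[0] = 0, cum[i+1] = cum[i] + q_lens[i]; cum[batch_size] == total_q_tokens.
#include <cstdint>
#include <vector>

std::vector<int64_t> build_cum_seq_lens_q(const std::vector<int64_t> &q_lens) {
    std::vector<int64_t> cum(q_lens.size() + 1, 0);
    for (size_t i = 0; i < q_lens.size(); ++i) {
        cum[i + 1] = cum[i] + q_lens[i];
    }
    return cum;
}
```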
include/infiniop/ops/paged_caching.h (new file)

```c
#ifndef __INFINIOP_PAGED_CACHING_API_H__
#define __INFINIOP_PAGED_CACHING_API_H__

#include "../operator_descriptor.h"

// Opaque handle for the Paged Caching descriptor.
typedef struct InfiniopDescriptor *infiniopPagedCachingDescriptor_t;

/**
 * @brief Creates a descriptor for the Paged Caching operation.
 *
 * This function initializes a descriptor that holds all the metadata needed
 * to copy key/value vectors into their respective cache pools.
 *
 * @param handle            The handle to the InfiniOP library context.
 * @param desc_ptr          A pointer to store the created descriptor.
 * @param k_cache_desc      Descriptor for the key cache pool tensor.
 * @param v_cache_desc      Descriptor for the value cache pool tensor.
 * @param k_desc            Descriptor for the source key tensor.
 * @param v_desc            Descriptor for the source value tensor.
 * @param slot_mapping_desc Descriptor for the slot mapping tensor.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopCreatePagedCachingDescriptor(
    infiniopHandle_t handle,
    infiniopPagedCachingDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t k_cache_desc,
    infiniopTensorDescriptor_t v_cache_desc,
    infiniopTensorDescriptor_t k_desc,
    infiniopTensorDescriptor_t v_desc,
    infiniopTensorDescriptor_t slot_mapping_desc);

/**
 * @brief Retrieves the workspace size required for the Paged Caching operation.
 *
 * @param desc The Paged Caching descriptor.
 * @param size A pointer to store the required workspace size in bytes (typically 0).
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopGetPagedCachingWorkspaceSize(
    infiniopPagedCachingDescriptor_t desc,
    size_t *size);

/**
 * @brief Executes the Paged Caching operation.
 *
 * @param desc           The Paged Caching descriptor.
 * @param workspace      Pointer to the workspace memory.
 * @param workspace_size The size of the workspace.
 * @param k_cache        Pointer to the key cache pool data.
 * @param v_cache        Pointer to the value cache pool data.
 * @param k              Pointer to the source key tensor data.
 * @param v              Pointer to the source value tensor data.
 * @param slot_mapping   Pointer to the slot mapping data.
 * @param stream         The CUDA stream for the operation. Can be NULL.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopPagedCaching(
    infiniopPagedCachingDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *k_cache,
    void *v_cache,
    const void *k,
    const void *v,
    const void *slot_mapping,
    void *stream);

/**
 * @brief Destroys a Paged Caching descriptor.
 *
 * @param desc The descriptor to be destroyed.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopDestroyPagedCachingDescriptor(
    infiniopPagedCachingDescriptor_t desc);

#endif // __INFINIOP_PAGED_CACHING_API_H__
```
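The slot mapping is the only non-obvious input. In the paging scheme used by the attention headers above (blocks of block_size slots addressed through a block table), a token at logical position pos in a sequence would map to physical slot block_table[pos / block_size] * block_size + pos % block_size. That formula follows the common vLLM-style convention and is an assumption about this kernel, not something stated in the header; `build_slot_mapping` is a hypothetical helper:

```cpp
// Assumed slot computation for one sequence's new tokens, vLLM-style:
// physical_slot = block_table[pos / block_size] * block_size + pos % block_size.
#include <cstdint>
#include <vector>

std::vector<int64_t> build_slot_mapping(const std::vector<int64_t> &block_table,
                                        int64_t past_len, int64_t num_new_tokens,
                                        int64_t block_size) {
    std::vector<int64_t> slots(num_new_tokens);
    for (int64_t i = 0; i < num_new_tokens; ++i) {
        int64_t pos = past_len + i; // logical position of the new token
        slots[i] = block_table[pos / block_size] * block_size + pos % block_size;
    }
    return slots;
}
```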
include/infiniop/ops/quant/per_channel_quant_int8.h (new file)

```c
#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;

__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(
    infiniopHandle_t handle,
    infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
    infiniopTensorDescriptor_t x_packed_desc,
    infiniopTensorDescriptor_t x_scale_desc,
    infiniopTensorDescriptor_t x_zero_desc,
    infiniopTensorDescriptor_t x_desc);

__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(
    infiniopPerChannelQuantI8Descriptor_t desc,
    size_t *size);

__C __export infiniStatus_t infiniopPerChannelQuantI8(
    infiniopPerChannelQuantI8Descriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *x_packed,
    void *x_scale,
    void *x_zero,
    const void *x,
    void *stream);

__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(
    infiniopPerChannelQuantI8Descriptor_t desc);

#endif
```
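This quantizer is the natural producer for infiniopI8Gemm above: x is quantized per channel into x_packed with x_scale (and x_zero), which then feed the int8 GEMM alongside pre-quantized weights. A hedged sketch of chaining the two execute calls; `quantized_linear` is a hypothetical helper, `INFINI_STATUS_SUCCESS` an assumed status name, and descriptors, buffers, and workspaces are assumed to be prepared as in the lifecycle sketch earlier:

```cpp
// Sketch: per-channel int8 quantization of activations followed by int8 GEMM.
// All descriptors and buffers are assumed to be pre-created and correctly shaped.
infiniStatus_t quantized_linear(
    infiniopPerChannelQuantI8Descriptor_t quant_desc,
    infiniopI8GemmDescriptor_t gemm_desc,
    void *quant_ws, size_t quant_ws_size,
    void *gemm_ws, size_t gemm_ws_size,
    void *x_packed, void *x_scale, void *x_zero, const void *x,
    void *out, const void *bias,
    const void *weights, const void *weights_scale,
    void *stream) {
    // 1) Quantize activations per channel: x -> (x_packed, x_scale, x_zero).
    infiniStatus_t status = infiniopPerChannelQuantI8(
        quant_desc, quant_ws, quant_ws_size, x_packed, x_scale, x_zero, x, stream);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    // 2) int8 GEMM with per-channel rescale; assumed semantics:
    //    out = dequant(x_packed @ weights, x_scale, weights_scale) + bias.
    return infiniopI8Gemm(gemm_desc, gemm_ws, gemm_ws_size,
                          out, bias, x_packed, x_scale,
                          weights, weights_scale, stream);
}
```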
include/infiniop/ops/silu_and_mul.h (new file)

```c
#ifndef __INFINIOP_SILU_AND_MUL_API_H__
#define __INFINIOP_SILU_AND_MUL_API_H__

#include "../operator_descriptor.h"

/**
 * @brief Opaque handle for the SiluAndMul descriptor.
 */
typedef struct InfiniopDescriptor *infiniopSiluAndMulDescriptor_t;

/**
 * @brief Creates a descriptor for the SiLU-and-multiply (SiluAndMul) operation.
 *
 * Format: (input_shape, output_shape)
 * Referencing the vLLM SiluAndMul kernel interface:
 *   - input_shape is [..., 2*d] (the last dimension is split into two halves, for SiLU and multiplication)
 *   - output_shape is [..., d] (the last dimension is reduced to half)
 *
 * @param handle   The handle to the InfiniOP library context.
 * @param desc_ptr A pointer to store the created descriptor.
 * @param output   Descriptor for the output tensor. Shape [..., d].
 * @param input    Descriptor for the input tensor. Shape [..., 2*d].
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopCreateSiluAndMulDescriptor(
    infiniopHandle_t handle,
    infiniopSiluAndMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output,
    infiniopTensorDescriptor_t input);

/**
 * @brief Queries the workspace size required for the SiluAndMul computation.
 *
 * @param desc The SiluAndMul descriptor.
 * @param size Pointer to store the required workspace size in bytes.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(
    infiniopSiluAndMulDescriptor_t desc,
    size_t *size);

/**
 * @brief Executes the SiluAndMul operation.
 *
 * Performs the SiLU activation on the first half of the last dimension of `input`,
 * multiplies it element-wise with the second half, and stores the result in `output`.
 *
 * @param desc           The SiluAndMul descriptor.
 * @param workspace      Pointer to workspace memory allocated according to GetWorkspaceSize().
 * @param workspace_size Size of the workspace in bytes.
 * @param output         Pointer to the output tensor memory. Shape [..., d].
 * @param input          Pointer to the input tensor memory. Shape [..., 2*d].
 * @param stream         Pointer to the execution stream (e.g., a CUDA stream). Can be NULL for the default stream.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopSiluAndMul(
    infiniopSiluAndMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream);

/**
 * @brief Destroys a previously created SiluAndMul descriptor.
 *
 * @param desc The descriptor to destroy.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopDestroySiluAndMulDescriptor(
    infiniopSiluAndMulDescriptor_t desc);

#endif // __INFINIOP_SILU_AND_MUL_API_H__
```
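A scalar reference for the semantics documented above (SiLU over the first half of the last dimension, elementwise product with the second half). `silu_and_mul_reference` is a hypothetical helper, and the float dtype and contiguous layout are assumptions:

```cpp
// Reference semantics: output[.., j] = silu(input[.., j]) * input[.., d + j],
// where silu(x) = x / (1 + exp(-x)) and the last dimension of input is 2*d.
#include <cmath>
#include <cstddef>

void silu_and_mul_reference(float *output, const float *input,
                            size_t rows, size_t d) {
    for (size_t r = 0; r < rows; ++r) {
        const float *row = input + r * 2 * d;
        for (size_t j = 0; j < d; ++j) {
            float x = row[j];
            float silu = x / (1.0f + std::exp(-x));
            output[r * d + j] = silu * row[d + j];
        }
    }
}
```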
include/infinirt.h

```diff
@@ -6,6 +6,9 @@
 typedef void *infinirtStream_t;
 typedef void *infinirtEvent_t;
+typedef void *infinirtGraph_t;
+typedef void *infinirtGraphNode_t;
+typedef void *infinirtGraphExec_t;

 __C __export infiniStatus_t infinirtInit();
@@ -63,4 +66,24 @@ __C __export infiniStatus_t infinirtMemcpyAsync(void *dst, const void *src, size
 __C __export infiniStatus_t infinirtMallocAsync(void **p_ptr, size_t size, infinirtStream_t stream);
 __C __export infiniStatus_t infinirtFreeAsync(void *ptr, infinirtStream_t stream);

+// Graph
+typedef enum {
+    INFINIRT_STREAM_CAPTURE_MODE_GLOBAL = 0,
+    INFINIRT_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1,
+    INFINIRT_STREAM_CAPTURE_MODE_RELAXED = 2,
+} infinirtStreamCaptureMode_t;
+
+__C __export infiniStatus_t infinirtStreamBeginCapture(infinirtStream_t stream, infinirtStreamCaptureMode_t mode);
+__C __export infiniStatus_t infinirtStreamEndCapture(infinirtStream_t stream, infinirtGraph_t *graph_ptr);
+__C __export infiniStatus_t infinirtGraphDestroy(infinirtGraph_t graph);
+__C __export infiniStatus_t infinirtGraphInstantiate(infinirtGraphExec_t *graph_exec_ptr, infinirtGraph_t graph, infinirtGraphNode_t *node_ptr, char *log_buffer, size_t buffer_size);
+__C __export infiniStatus_t infinirtGraphExecDestroy(infinirtGraphExec_t graph_exec);
+__C __export infiniStatus_t infinirtGraphLuanch(infinirtGraphExec_t graph_exec, infinirtStream_t stream);
+
 #endif // __INFINIRT_API_H__
```
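A sketch of the capture/instantiate/launch flow implied by these declarations, with error handling elided; `capture_and_replay` is a hypothetical helper, and note that the launch entry point is spelled infinirtGraphLuanch in this header:

```cpp
// Sketch: capture work enqueued on a stream into a graph, instantiate it
// once, then replay it. All calls are the ones declared above.
void capture_and_replay(infinirtStream_t stream) {
    infinirtStreamBeginCapture(stream, INFINIRT_STREAM_CAPTURE_MODE_THREAD_LOCAL);

    // ... enqueue the work to be captured on `stream` here ...

    infinirtGraph_t graph = nullptr;
    infinirtStreamEndCapture(stream, &graph);

    infinirtGraphExec_t graph_exec = nullptr;
    char log_buffer[1024] = {0};
    infinirtGraphInstantiate(&graph_exec, graph, /*node_ptr=*/nullptr,
                             log_buffer, sizeof(log_buffer));

    infinirtGraphLuanch(graph_exec, stream); // spelling as declared in the header

    infinirtGraphExecDestroy(graph_exec);
    infinirtGraphDestroy(graph);
}
```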
python/infinicore/__init__.py

```diff
@@ -8,7 +8,10 @@ from infinicore.context import (
     get_device,
     get_device_count,
     get_stream,
+    is_graph_recording,
     set_device,
+    start_graph_recording,
+    stop_graph_recording,
     sync_device,
     sync_stream,
 )
@@ -40,10 +43,15 @@ from infinicore.dtype import (
     uint8,
 )
 from infinicore.ops.add import add
+from infinicore.ops.add_rms_norm import add_rms_norm
 from infinicore.ops.attention import attention
+from infinicore.ops.kv_caching import kv_caching
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
+from infinicore.ops.paged_attention import paged_attention
+from infinicore.ops.paged_attention_prefill import paged_attention_prefill
+from infinicore.ops.paged_caching import paged_caching
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
@@ -77,6 +85,9 @@ __all__ = [
     "set_device",
     "sync_device",
     "sync_stream",
+    "is_graph_recording",
+    "start_graph_recording",
+    "stop_graph_recording",
     # Data Types.
     "bfloat16",
     "bool",
@@ -102,7 +113,10 @@ __all__ = [
     "uint8",
     # Operations.
     "add",
+    "add_rms_norm",
+    "add_rms_norm_",
     "attention",
+    "kv_caching",
     "matmul",
     "mul",
     "narrow",
@@ -115,6 +129,9 @@ __all__ = [
     "from_list",
     "from_numpy",
     "from_torch",
+    "paged_caching",
+    "paged_attention",
+    "paged_attention_prefill",
     "ones",
     "strided_empty",
     "strided_from_blob",
```
python/infinicore/context.py

```diff
@@ -1,2 +1,3 @@
 import infinicore.device
+from infinicore.graph import Graph
 from infinicore.lib import _infinicore
@@ -49,3 +50,24 @@ def get_stream():
         stream: The current stream object
     """
     return _infinicore.get_stream()
+
+
+def is_graph_recording():
+    """Check if the current graph is recording.
+
+    Returns:
+        bool: True if the current graph is recording, False otherwise
+    """
+    return _infinicore.is_graph_recording()
+
+
+def start_graph_recording(device=None):
+    """Start recording the current graph."""
+    if device is not None:
+        set_device(device)
+    _infinicore.start_graph_recording()
+
+
+def stop_graph_recording():
+    """Stop recording the current graph."""
+    return Graph(_infinicore.stop_graph_recording())
```
python/infinicore/device.py

```diff
@@ -34,7 +34,10 @@ class device:
     def __getattr__(self, name):
         # Lazily construct and cache an attribute,
         # such as self._underlying.
-        setattr(self, name, device._to_infinicore_device(self.type, self.index))
+        if name == "_underlying":
+            setattr(self, name, device._to_infinicore_device(self.type, self.index))
+        else:
+            raise AttributeError("{!r} object has no attribute {!r}".format(self, name))

         return getattr(self, name)

     def __repr__(self):
@@ -79,6 +82,7 @@ _TORCH_DEVICE_MAP = {
     _infinicore.Device.Type.KUNLUN: "cuda",
     _infinicore.Device.Type.HYGON: "cuda",
     _infinicore.Device.Type.QY: "cuda",
+    _infinicore.Device.Type.ALI: "cuda",
 }
```
python/infinicore/graph.py (new file)

```python
from infinicore.lib import _infinicore


class Graph:
    """Python wrapper around an InfiniCore Graph instance."""

    def __init__(self, graph: _infinicore.Graph):
        if not isinstance(graph, _infinicore.Graph):
            raise TypeError("Expected _infinicore.Graph")

        self._graph = graph

    def run(self):
        return self._graph.run()

    def __repr__(self):
        return f"<Graph wrapper of {self._graph!r}>"
```
python/infinicore/nn/functional/__init__.py

```diff
 from .causal_softmax import causal_softmax
 from .embedding import embedding
+from .flash_attention import flash_attention
 from .linear import linear
+from .linear_w8a8i8 import linear_w8a8i8
 from .random_sample import random_sample
 from .rms_norm import rms_norm
 from .rope import RopeAlgo, rope
 from .silu import silu
+from .silu_and_mul import silu_and_mul
 from .swiglu import swiglu

 __all__ = [
     "causal_softmax",
+    "embedding",
+    "flash_attention",
+    "linear",
     "random_sample",
     "rms_norm",
-    "RopeAlgo",
-    "rope",
     "silu",
     "swiglu",
-    "linear",
+    "linear_w8a8i8",
-    "embedding",
+    "silu_and_mul",
+    "rope",
+    "RopeAlgo",
 ]
```
python/infinicore/nn/functional/embedding.py

```diff
@@ -22,9 +22,8 @@ def embedding(
         and (sparse is False)
     ), "Unsupported parameters."

-    assert "cpu" == input.device.type, (
-        "The device of 'input' variable must be on the CPU."
-    )
+    # Note: embedding now supports device-side input for graph recording.
+    # The C++ implementation handles both CPU and device-side inputs.

     if out is None:
         return Tensor(_infinicore.embedding(input._underlying, weight._underlying))
```
python/infinicore/nn/functional/flash_attention.py (new file)

```python
import math

from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def flash_attention(
    query,
    key,
    value,
    total_kv_len,
    attn_mask=None,
    dropout_p=0,
    is_causal=False,
    scale=None,
    enable_gqa=False,
):
    assert attn_mask is None and dropout_p == 0 and not enable_gqa

    emb_dim = query.shape[-1]

    if scale is None:
        scale = 1 / math.sqrt(emb_dim)

    return Tensor(
        _infinicore.flash_attention(
            query._underlying,
            key._underlying,
            value._underlying,
            total_kv_len._underlying,
            scale,
            is_causal,
        )
    )
```