Unverified Commit dce99862 authored by thatPepe's avatar thatPepe Committed by GitHub
Browse files

Merge pull request #1053 from InfiniTensor/issue/1033xmake

Issue/1033 patch aten and fa adaptations
parents 8d99a8f5 d6e44e84
......@@ -107,6 +107,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
| `--ali-ppu=[y\|n]` | 是否编译阿里 PPU 接口实现 | n
| `--ninetoothed=[y\|n]` | 是否编译九齿实现 | n
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
| `--graph=[y\|n]` | 是否编译 cuda graph 接口实现 | n
##### 手动安装底层库
......@@ -154,6 +155,20 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
xmake f --ascend-npu=true -cv
```
##### 试验功能 -- 使用flash attention库中的算子
```shell
(1) 在third_party目录拉取cutlass和flash attn库的源码(不需要--recursive)
(2) 设置(1)中cutlass路径的环境变量CUTLASS_ROOT
(3) xmake配置环节额外打开 --aten 开关,并设置 --flash-attn 库位置,例:
xmake f --nv-gpu=y --ccl=y --cuda=$CUDA_HOME --aten=y --flash-attn=<path-to>/InfiniCore/third_party/flash-attention -cv
(4) flash attention库会伴随infinicore_cpp_api一同编译安装
```
2. 编译安装
默认安装路径为 `$HOME/.infini`
......
......@@ -15,15 +15,15 @@ struct InfinicclComm;
typedef struct InfinicclComm *infinicclComm_t;
__C __export infiniStatus_t infinicclCommInitAll(
__INFINI_C __export infiniStatus_t infinicclCommInitAll(
infiniDevice_t device_type,
infinicclComm_t *comms,
int ndevice,
const int *device_ids);
__C __export infiniStatus_t infinicclCommDestroy(infinicclComm_t comm);
__INFINI_C __export infiniStatus_t infinicclCommDestroy(infinicclComm_t comm);
__C __export infiniStatus_t infinicclAllReduce(
__INFINI_C __export infiniStatus_t infinicclAllReduce(
void *sendbuf,
void *recvbuf,
size_t count,
......
......@@ -10,10 +10,10 @@
#endif
#ifdef __cplusplus
#define __C extern "C"
#define __INFINI_C extern "C"
#include <cstddef>
#else
#define __C
#define __INFINI_C
#include <stddef.h>
#endif
......
#pragma once
// NOTE: #pragma once must precede the #ifdef guard — placed inside a false
// conditional it is never seen by the preprocessor, so the file loses its
// include-once protection whenever ENABLE_ATEN is not defined.

#ifdef ENABLE_ATEN

#include "../context/context.hpp"
#include "../tensor.hpp"

#include <stdexcept> // std::runtime_error (was used below without being included)

#include <ATen/ATen.h>
#ifdef ENABLE_NVIDIA_API
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#endif

namespace infinicore::adaptor {

/// @brief Maps an infinicore DataType to the equivalent ATen scalar type.
/// @throws std::runtime_error for dtypes with no ATen counterpart.
inline at::ScalarType to_at_dtype(DataType dtype) {
    switch (dtype) {
    case DataType::F32:
        return at::kFloat;
    case DataType::F16:
        return at::kHalf;
    case DataType::BF16:
        return at::kBFloat16;
    case DataType::I32:
        return at::kInt;
    case DataType::I64:
        return at::kLong;
    default:
        throw std::runtime_error("Unsupported dtype for ATen");
    }
}

/// @brief Maps an infinicore Device to an ATen device.
/// Only NVIDIA (-> CUDA, preserving the device index) and CPU are supported.
/// @throws std::runtime_error for any other device type.
inline at::Device to_at_device(const Device &device) {
    if (device.getType() == Device::Type::NVIDIA) {
        return at::Device(at::kCUDA, device.getIndex());
    } else if (device.getType() == Device::Type::CPU) {
        return at::Device(at::kCPU);
    } else {
        throw std::runtime_error("Unsupported device type for ATen");
    }
}

// Wraps an infinicore tensor as an at::Tensor (declaration only; implemented
// elsewhere — presumably a zero-copy view over the same buffer, TODO confirm).
at::Tensor to_aten_tensor(const infinicore::Tensor &t);

#ifdef ENABLE_NVIDIA_API
// Returns the CUDA stream ATen work should run on (declaration only).
c10::cuda::CUDAStream get_cuda_stream();
#endif

} // namespace infinicore::adaptor

#endif // ENABLE_ATEN
#pragma once
// NOTE: #pragma once must precede the #ifdef guard — inside a false
// conditional it is skipped, leaving the header unguarded when
// ENABLE_FLASH_ATTN is not defined.

#ifdef ENABLE_FLASH_ATTN

#include "aten_adaptor.hpp"

// Forward declarations of the flash-attention library's C++ entry points.
// Signatures (including parameter order and const/reference qualifiers) must
// stay byte-for-byte in sync with the flash-attn sources built alongside this
// project — do not edit them independently.
namespace flash {

// Dense (fixed-length) forward pass.
std::vector<at::Tensor>
mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
        const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
        const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
        std::optional<at::Tensor> &out_, // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
        std::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
        const float p_dropout,
        const float softmax_scale,
        bool is_causal,
        int window_size_left,
        int window_size_right,
        const float softcap,
        const bool return_softmax,
        std::optional<at::Generator> gen_);

// Variable-length (packed / cu_seqlens) forward pass.
std::vector<at::Tensor>
mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
               const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
               const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
               std::optional<at::Tensor> &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
               const at::Tensor &cu_seqlens_q, // b+1
               const at::Tensor &cu_seqlens_k, // b+1
               std::optional<at::Tensor> &seqused_k, // b. If given, only this many elements of each batch element's keys are used.
               std::optional<const at::Tensor> &leftpad_k_, // batch_size
               std::optional<at::Tensor> &block_table_, // batch_size x max_num_blocks_per_seq
               std::optional<at::Tensor> &alibi_slopes_, // num_heads or b x num_heads
               int max_seqlen_q,
               const int max_seqlen_k,
               const float p_dropout,
               const float softmax_scale,
               const bool zero_tensors,
               bool is_causal,
               int window_size_left,
               int window_size_right,
               const float softcap,
               const bool return_softmax,
               std::optional<at::Generator> gen_);

// Dense backward pass.
std::vector<at::Tensor>
mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x multiple_of(head_size_og, 8)
        const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
        const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size
        const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size
        const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size
        const at::Tensor &softmax_lse, // b x h x seqlen_q
        std::optional<at::Tensor> &dq_, // batch_size x seqlen_q x num_heads x head_size
        std::optional<at::Tensor> &dk_, // batch_size x seqlen_k x num_heads_k x head_size
        std::optional<at::Tensor> &dv_, // batch_size x seqlen_k x num_heads_k x head_size
        std::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
        const float p_dropout, // probability to drop
        const float softmax_scale,
        const bool is_causal,
        int window_size_left,
        int window_size_right,
        const float softcap,
        const bool deterministic,
        std::optional<at::Generator> gen_,
        std::optional<at::Tensor> &rng_state);

// Variable-length backward pass.
std::vector<at::Tensor>
mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size
               const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
               const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
               const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
               const at::Tensor &out, // total_q x num_heads x head_size
               const at::Tensor &softmax_lse, // h x total_q, softmax logsumexp
               std::optional<at::Tensor> &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
               std::optional<at::Tensor> &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
               std::optional<at::Tensor> &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
               const at::Tensor &cu_seqlens_q, // b+1
               const at::Tensor &cu_seqlens_k, // b+1
               std::optional<at::Tensor> &alibi_slopes_, // num_heads or b x num_heads
               const int max_seqlen_q,
               const int max_seqlen_k, // max sequence length to choose the kernel
               const float p_dropout, // probability to drop
               const float softmax_scale,
               const bool zero_tensors,
               const bool is_causal,
               int window_size_left,
               int window_size_right,
               const float softcap,
               const bool deterministic,
               std::optional<at::Generator> gen_,
               std::optional<at::Tensor> &rng_state);

// Forward pass reading from (and optionally appending to) a KV cache,
// with optional paged-KV block table and rotary embedding of new keys.
std::vector<at::Tensor>
mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
                const at::Tensor &kcache, // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
                const at::Tensor &vcache, // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
                std::optional<const at::Tensor> &k_, // batch_size x seqlen_knew x num_heads_k x head_size
                std::optional<const at::Tensor> &v_, // batch_size x seqlen_knew x num_heads_k x head_size
                std::optional<const at::Tensor> &seqlens_k_, // batch_size
                std::optional<const at::Tensor> &rotary_cos_, // seqlen_ro x (rotary_dim / 2)
                std::optional<const at::Tensor> &rotary_sin_, // seqlen_ro x (rotary_dim / 2)
                std::optional<const at::Tensor> &cache_batch_idx_, // indices to index into the KV cache
                std::optional<const at::Tensor> &leftpad_k_, // batch_size
                std::optional<at::Tensor> &block_table_, // batch_size x max_num_blocks_per_seq
                std::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
                std::optional<at::Tensor> &out_, // batch_size x seqlen_q x num_heads x head_size
                const float softmax_scale,
                bool is_causal,
                int window_size_left,
                int window_size_right,
                const float softcap,
                bool is_rotary_interleaved, // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2
                int num_splits);

} // namespace flash

#endif // ENABLE_FLASH_ATTN
#pragma once
#include "../device.hpp"
#include "common/op.hpp"
#include <optional>
// Public API for variable-length multi-head attention (packed sequences
// addressed via cumulative-length tensors and a paged-KV block table).
namespace infinicore::op {
// Presumably declares the graph-op class for this operator; the parameter
// list mirrors the mha_varlen signature below. NOTE(review): macro semantics
// live in common/op.hpp — confirm there.
INFINICORE_GRAPH_OP_CLASS(
    MultiheadAttentionVarlen,
    Tensor,
    const Tensor &,
    const Tensor &,
    const Tensor &,
    const Tensor &,
    const Tensor &,
    const Tensor &,
    int,
    int,
    std::optional<Tensor>,
    float);
// Varlen attention returning a newly allocated output tensor.
// q/k/v: packed query/key/value tensors; cum_seqlens_q/k: cumulative
// per-batch sequence lengths; block_table: paged-KV block indices;
// alibi_slopes: optional ALiBi bias slopes; scale: softmax scaling factor.
Tensor mha_varlen(const Tensor &q,
                  const Tensor &k,
                  const Tensor &v,
                  const Tensor &cum_seqlens_q,
                  const Tensor &cum_seqlens_k,
                  const Tensor &block_table,
                  int max_seqlen_q,
                  int max_seqlen_k,
                  std::optional<Tensor> alibi_slopes,
                  float scale);
// In-place variant: same computation, but writes the result into `out`
// (trailing underscore follows the file's in-place naming convention).
void mha_varlen_(Tensor out,
                 const Tensor &q,
                 const Tensor &k,
                 const Tensor &v,
                 const Tensor &cum_seqlens_q,
                 const Tensor &cum_seqlens_k,
                 const Tensor &block_table,
                 int max_seqlen_q,
                 int max_seqlen_k,
                 std::optional<Tensor> alibi_slopes,
                 float scale);
} // namespace infinicore::op
......@@ -7,8 +7,8 @@ struct InfiniopHandle;
typedef struct InfiniopHandle *infiniopHandle_t;
__C __export infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr);
__INFINI_C __export infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr);
__C __export infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle);
__INFINI_C __export infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle);
#endif
......@@ -7,7 +7,7 @@
// Base descriptor for all operators
struct InfiniopDescriptor;
__C __export infiniStatus_t infiniopGetDescriptorDeviceType(const struct InfiniopDescriptor *desc_ptr, infiniDevice_t *device_type);
__C __export infiniStatus_t infiniopGetDescriptorDeviceId(const struct InfiniopDescriptor *desc_ptr, int *device_id);
__INFINI_C __export infiniStatus_t infiniopGetDescriptorDeviceType(const struct InfiniopDescriptor *desc_ptr, infiniDevice_t *device_type);
__INFINI_C __export infiniStatus_t infiniopGetDescriptorDeviceId(const struct InfiniopDescriptor *desc_ptr, int *device_id);
#endif //__INFINIOP_OPERATOR_DESCRIPTOR_API_H__
......@@ -5,15 +5,15 @@
typedef struct InfiniopDescriptor *infiniopAddDescriptor_t;
__C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle,
__INFINI_C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle,
infiniopAddDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c,
infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b);
__C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size);
__INFINI_C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc,
__INFINI_C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
......@@ -21,6 +21,6 @@ __C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc,
const void *b,
void *stream);
__C __export infiniStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc);
__INFINI_C __export infiniStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc);
#endif
......@@ -5,7 +5,7 @@
typedef struct InfiniopDescriptor *infiniopAddRMSNormDescriptor_t;
__C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor(
__INFINI_C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor(
infiniopHandle_t handle,
infiniopAddRMSNormDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
......@@ -15,9 +15,9 @@ __C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor(
infiniopTensorDescriptor_t weight_desc,
float epsilon);
__C __export infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(infiniopAddRMSNormDescriptor_t desc, size_t *size);
__INFINI_C __export infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(infiniopAddRMSNormDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopAddRMSNorm(infiniopAddRMSNormDescriptor_t desc,
__INFINI_C __export infiniStatus_t infiniopAddRMSNorm(infiniopAddRMSNormDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
......@@ -27,6 +27,6 @@ __C __export infiniStatus_t infiniopAddRMSNorm(infiniopAddRMSNormDescriptor_t de
const void *weight,
void *stream);
__C __export infiniStatus_t infiniopDestroyAddRMSNormDescriptor(infiniopAddRMSNormDescriptor_t desc);
__INFINI_C __export infiniStatus_t infiniopDestroyAddRMSNormDescriptor(infiniopAddRMSNormDescriptor_t desc);
#endif
......@@ -7,7 +7,7 @@
typedef struct InfiniopDescriptor *infiniopAttentionDescriptor_t;
__C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t handle,
__INFINI_C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t handle,
infiniopAttentionDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t q_desc,
......@@ -17,9 +17,9 @@ __C __export infiniStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t h
infiniopTensorDescriptor_t v_cache_desc,
size_t pos);
__C __export infiniStatus_t infiniopGetAttentionWorkspaceSize(infiniopAttentionDescriptor_t desc, size_t *size);
__INFINI_C __export infiniStatus_t infiniopGetAttentionWorkspaceSize(infiniopAttentionDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc,
__INFINI_C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
......@@ -30,5 +30,5 @@ __C __export infiniStatus_t infiniopAttention(infiniopAttentionDescriptor_t desc
void *v_cache,
void *stream);
__C __export infiniStatus_t infiniopDestroyAttentionDescriptor(infiniopAttentionDescriptor_t desc);
__INFINI_C __export infiniStatus_t infiniopDestroyAttentionDescriptor(infiniopAttentionDescriptor_t desc);
#endif
......@@ -5,15 +5,15 @@
typedef struct InfiniopDescriptor *infiniopCausalSoftmaxDescriptor_t;
__C __export infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
__INFINI_C __export infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
infiniopHandle_t handle,
infiniopCausalSoftmaxDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc);
__C __export infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, size_t *size);
__INFINI_C __export infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopCausalSoftmax(
__INFINI_C __export infiniStatus_t infiniopCausalSoftmax(
infiniopCausalSoftmaxDescriptor_t desc,
void *workspace,
size_t workspace_size,
......@@ -21,6 +21,6 @@ __C __export infiniStatus_t infiniopCausalSoftmax(
const void *x,
void *stream);
__C __export infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc);
__INFINI_C __export infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc);
#endif
......@@ -5,16 +5,16 @@
typedef struct InfiniopDescriptor *infiniopClipDescriptor_t;
__C __export infiniStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
__INFINI_C __export infiniStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
infiniopClipDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
infiniopTensorDescriptor_t min_val,
infiniopTensorDescriptor_t max_val);
__C __export infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size);
__INFINI_C __export infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopClip(infiniopClipDescriptor_t desc,
__INFINI_C __export infiniStatus_t infiniopClip(infiniopClipDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
......@@ -23,6 +23,6 @@ __C __export infiniStatus_t infiniopClip(infiniopClipDescriptor_t desc,
const void *max_val,
void *stream);
__C __export infiniStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
__INFINI_C __export infiniStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
#endif
......@@ -5,7 +5,7 @@
typedef struct InfiniopDescriptor *infiniopConvDescriptor_t;
__C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle,
__INFINI_C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle,
infiniopConvDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
......@@ -16,10 +16,10 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle
void *dilations,
size_t n);
__C __export infiniStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, size_t *size);
__INFINI_C __export infiniStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, size_t workspace_size, void *y, const void *x, const void *w, const void *bias, void *stream);
__INFINI_C __export infiniStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, size_t workspace_size, void *y, const void *x, const void *w, const void *bias, void *stream);
__C __export infiniStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc);
__INFINI_C __export infiniStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc);
#endif
......@@ -5,16 +5,16 @@
typedef struct InfiniopDescriptor *infiniopDequantizeAWQDescriptor_t;
__C __export infiniStatus_t infiniopCreateDequantizeAWQDescriptor(infiniopHandle_t handle,
__INFINI_C __export infiniStatus_t infiniopCreateDequantizeAWQDescriptor(infiniopHandle_t handle,
infiniopDequantizeAWQDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t qweight_desc,
infiniopTensorDescriptor_t scales_desc,
infiniopTensorDescriptor_t zeros_desc);
__C __export infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDescriptor_t desc, size_t *size);
__INFINI_C __export infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopDequantizeAWQ(infiniopDequantizeAWQDescriptor_t desc,
__INFINI_C __export infiniStatus_t infiniopDequantizeAWQ(infiniopDequantizeAWQDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
......@@ -23,6 +23,6 @@ __C __export infiniStatus_t infiniopDequantizeAWQ(infiniopDequantizeAWQDescripto
const void *zeros,
void *stream);
__C __export infiniStatus_t infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc);
__INFINI_C __export infiniStatus_t infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc);
#endif
......@@ -5,21 +5,21 @@
typedef struct InfiniopDescriptor *infiniopEmbeddingDescriptor_t;
__C __export infiniStatus_t infiniopCreateEmbeddingDescriptor(
__INFINI_C __export infiniStatus_t infiniopCreateEmbeddingDescriptor(
infiniopHandle_t handle,
infiniopEmbeddingDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc);
__C __export infiniStatus_t infiniopEmbedding(
__INFINI_C __export infiniStatus_t infiniopEmbedding(
infiniopEmbeddingDescriptor_t desc,
void *output,
const void *input,
const void *weight,
void *stream);
__C __export infiniStatus_t infiniopDestroyEmbeddingDescriptor(
__INFINI_C __export infiniStatus_t infiniopDestroyEmbeddingDescriptor(
infiniopEmbeddingDescriptor_t desc);
#endif
......
......@@ -5,7 +5,7 @@
typedef struct InfiniopDescriptor *infiniopFlashAttentionDescriptor_t;
__C __export infiniStatus_t infiniopCreateFlashAttentionDescriptor(
__INFINI_C __export infiniStatus_t infiniopCreateFlashAttentionDescriptor(
infiniopHandle_t handle,
infiniopFlashAttentionDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
......@@ -16,11 +16,11 @@ __C __export infiniStatus_t infiniopCreateFlashAttentionDescriptor(
float scale,
char is_causal);
__C __export infiniStatus_t infiniopGetFlashAttentionWorkspaceSize(
__INFINI_C __export infiniStatus_t infiniopGetFlashAttentionWorkspaceSize(
infiniopFlashAttentionDescriptor_t desc,
size_t *size);
__C __export infiniStatus_t infiniopFlashAttention(
__INFINI_C __export infiniStatus_t infiniopFlashAttention(
infiniopFlashAttentionDescriptor_t desc,
void *workspace,
size_t workspace_size,
......@@ -31,6 +31,6 @@ __C __export infiniStatus_t infiniopFlashAttention(
const void *total_kv_len,
void *stream);
__C __export infiniStatus_t infiniopDestroyFlashAttentionDescriptor(
__INFINI_C __export infiniStatus_t infiniopDestroyFlashAttentionDescriptor(
infiniopFlashAttentionDescriptor_t desc);
#endif
......@@ -5,20 +5,20 @@
typedef struct InfiniopDescriptor *infiniopGeluDescriptor_t;
__C __export infiniStatus_t infiniopCreateGeluDescriptor(infiniopHandle_t handle,
__INFINI_C __export infiniStatus_t infiniopCreateGeluDescriptor(infiniopHandle_t handle,
infiniopGeluDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output,
infiniopTensorDescriptor_t intput);
__C __export infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, size_t *size);
__INFINI_C __export infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopGelu(infiniopGeluDescriptor_t desc,
__INFINI_C __export infiniStatus_t infiniopGelu(infiniopGeluDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *intput,
void *stream);
__C __export infiniStatus_t infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc);
__INFINI_C __export infiniStatus_t infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc);
#endif
......@@ -5,15 +5,15 @@
typedef struct InfiniopDescriptor *infiniopGemmDescriptor_t;
__C __export infiniStatus_t infiniopCreateGemmDescriptor(infiniopHandle_t handle,
__INFINI_C __export infiniStatus_t infiniopCreateGemmDescriptor(infiniopHandle_t handle,
infiniopGemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);
__C __export infiniStatus_t infiniopGetGemmWorkspaceSize(infiniopGemmDescriptor_t desc, size_t *size);
__INFINI_C __export infiniStatus_t infiniopGetGemmWorkspaceSize(infiniopGemmDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopGemm(infiniopGemmDescriptor_t desc,
__INFINI_C __export infiniStatus_t infiniopGemm(infiniopGemmDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
......@@ -23,6 +23,6 @@ __C __export infiniStatus_t infiniopGemm(infiniopGemmDescriptor_t desc,
float beta,
void *stream);
__C __export infiniStatus_t infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc);
__INFINI_C __export infiniStatus_t infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc);
#endif
......@@ -5,7 +5,7 @@
typedef InfiniopDescriptor *infiniopI8GemmDescriptor_t;
__C __export infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
__INFINI_C __export infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
infiniopI8GemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t bias_desc,
......@@ -14,9 +14,9 @@ __C __export infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t hand
infiniopTensorDescriptor_t weights_desc,
infiniopTensorDescriptor_t weights_scale_desc);
__C __export infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t desc, size_t *size);
__INFINI_C __export infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
__INFINI_C __export infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
......@@ -27,6 +27,6 @@ __C __export infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
const void *weights_scale,
void *stream);
__C __export infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t desc);
__INFINI_C __export infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t desc);
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment