Commit 7a833987 authored by PanZezhongQY

feat: add operator.cc for the large-model operators; treat warnings as errors for CPU and CUDA

parent e3ea5bae
@@ -52,14 +52,14 @@ uint16_t f32_to_f16(float val) {
         // Infinity
         return sign | 0x7C00;
     } else if (exponent >= -14) {// Normalized case
-        return sign | ((exponent + 15) << 10) | (mantissa >> 13);
+        return (uint16_t)(sign | ((exponent + 15) << 10) | (mantissa >> 13));
     } else if (exponent >= -24) {
         mantissa |= 0x800000;// Add implicit leading 1
         mantissa >>= (-14 - exponent);
-        return sign | (mantissa >> 13);
+        return (uint16_t)(sign | (mantissa >> 13));
     } else {
         // Too small for subnormal: return signed zero
-        return sign;
+        return (uint16_t)sign;
     }
 }
 
...
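The only functional change above is the explicit `(uint16_t)` casts: with warnings promoted to errors (see the xmake changes at the end of this commit), the implicit int-to-uint16_t narrowing would fail the build. Below is a minimal, self-contained sketch of the same truncating f32-to-f16 conversion, assuming IEEE-754 binary32 input; it ignores rounding, does not distinguish NaN from infinity, and is not the library's exact source.

```cpp
// Sketch of a truncating f32 -> f16 bit conversion with explicit narrowing
// casts so that -Wall -Werror (or /W3 /WX) builds stay clean. The prologue
// (bit extraction) is an assumption about what precedes the hunk above.
#include <cstdint>
#include <cstring>

static uint16_t f32_to_f16_sketch(float val) {
    uint32_t bits;
    std::memcpy(&bits, &val, sizeof(bits));           // reinterpret the float bits
    uint32_t sign = (bits >> 16) & 0x8000u;           // sign moved into f16 position
    int exponent = (int)((bits >> 23) & 0xFF) - 127;  // unbiased exponent
    uint32_t mantissa = bits & 0x7FFFFFu;

    if (exponent > 15) {
        return (uint16_t)(sign | 0x7C00u);            // overflow (and NaN) -> infinity
    } else if (exponent >= -14) {                     // normalized f16 range
        return (uint16_t)(sign | ((uint32_t)(exponent + 15) << 10) | (mantissa >> 13));
    } else if (exponent >= -24) {                     // subnormal f16 range
        mantissa |= 0x800000u;                        // add the implicit leading 1
        mantissa >>= (uint32_t)(-14 - exponent);      // shift into subnormal position
        return (uint16_t)(sign | (mantissa >> 13));
    } else {
        return (uint16_t)sign;                        // too small: signed zero
    }
}
```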
#include "infiniop/ops/causal_softmax.h"
__C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
infiniopHandle_t handle,
infiniopCausalSoftmaxDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc) {
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateCausalSoftmaxDescriptor(handle, (CausalSoftmaxCpuDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaCreateCausalSoftmaxDescriptor((CudaHandle_t)handle, (CausalSoftmaxCudaDescriptor_t *) desc_ptr, y_desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxBangDescriptor_t *) desc_ptr, y_desc);
// return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnCreateCausalSoftmaxDescriptor((AscendHandle_t) handle, (CausalSoftmaxAclnnDescriptor_t *) desc_ptr, y_desc);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateCausalSoftmaxDescriptor((MacaHandle_t) handle, (CausalSoftmaxMacaDescriptor_t *) desc_ptr, y_desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaCreateCausalSoftmaxDescriptor((MusaHandle_t) handle, (CausalSoftmaxMusaDescriptor_t *) desc_ptr, y_desc);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, uint64_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCudaDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t) desc, size);
// return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnGetCausalSoftmaxWorkspaceSize((CausalSoftmaxAclnnDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t) desc, size);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *data, void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaCausalSoftmax((CausalSoftmaxCudaDescriptor_t) desc, workspace, workspace_size, data, stream);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t) desc, workspace, workspace_size, data, stream);
// return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnCausalSoftmax((CausalSoftmaxAclnnDescriptor_t) desc, workspace, workspace_size, data, stream);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t) desc, workspace, workspace_size, data, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t) desc, workspace, workspace_size, data, stream);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaDestroyCausalSoftmaxDescriptor((CausalSoftmaxCudaDescriptor_t) desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t) desc);
// return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnDestroyCausalSoftmaxDescriptor((CausalSoftmaxAclnnDescriptor_t) desc);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t) desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t) desc);
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
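This operator.cc, like the ones that follow, is a pure dispatch layer: every exported entry point switches on the device tag stored in the handle or descriptor, and each backend case is fenced by an `ENABLE_*` macro so that unbuilt backends drop out at compile time and the call falls through to `INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED`. The sketch below illustrates that idiom for a hypothetical `Foo` operator; the enum values, status codes, and `Foo*` names are stand-ins, not InfiniOp symbols.

```cpp
// Hypothetical illustration of the per-backend dispatch idiom used by these
// operator.cc files. All names here are stand-ins for the real InfiniOp types.
enum Device { DevCpu, DevNvGpu };

struct FooDescriptor { Device device; };
typedef FooDescriptor *FooDescriptor_t;

enum Status { STATUS_SUCCESS, STATUS_DEVICE_TYPE_NOT_SUPPORTED };

// Stub backends, defined inline so the sketch links.
static Status cpuFoo(FooDescriptor_t) { return STATUS_SUCCESS; }
static Status cudaFoo(FooDescriptor_t) { return STATUS_SUCCESS; }

Status infiniopFooSketch(FooDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuFoo(desc);   // present only when the CPU backend is built
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu:
        return cudaFoo(desc);  // present only when the CUDA backend is built
#endif
    }
    return STATUS_DEVICE_TYPE_NOT_SUPPORTED;  // backend not compiled in
}
```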
@@ -47,8 +47,8 @@ typedef struct BlasMatrix {
         *status = INFINIOP_STATUS_SUCCESS;
     }
 
-    bool match_batch(size_t batch) const {
-        return this->batch == batch || this->batch == 1;
+    bool match_batch(size_t _batch) const {
+        return this->batch == _batch || this->batch == 1;
     }
 
     void transpose() {
@@ -56,7 +56,7 @@ typedef struct BlasMatrix {
         std::swap(row_stride, col_stride);
     }
 
-    int ld() const {
+    int64_t ld() const {
         if (this->row_stride == 1) {
             return this->col_stride;
         } else {
...
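Two small fixes here: the `match_batch` parameter is renamed so it no longer shadows the `batch` member, and `ld()` now returns `int64_t` to match the 64-bit strides, so the narrowing to cuBLAS's `int` happens explicitly at the call site (next hunk). A minimal sketch of the leading-dimension idea, assuming `int64_t` strides as in `BlasMatrix`:

```cpp
// Minimal sketch of a BLAS-style matrix view. ld() returns the stride of the
// non-unit-stride dimension, which is what cuBLAS expects as the leading
// dimension; keeping it int64_t avoids an implicit narrowing in the getter.
#include <cstdint>

struct MatrixView {
    int64_t rows, cols;
    int64_t row_stride, col_stride;  // elements to step one row / one column

    int64_t ld() const {
        return row_stride == 1 ? col_stride : row_stride;
    }
};

// Usage: a 3x4 row-major matrix stored contiguously.
// MatrixView m{3, 4, /*row_stride=*/4, /*col_stride=*/1};  // m.ld() == 4
```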
@@ -31,24 +31,24 @@ infiniopStatus_t matmul_cuda(infiniopMatmulCudaDescriptor_t desc, void *c, float
             handle,
             op_a,
             op_b,
-            info.m,
-            info.n,
-            info.k,
+            static_cast<int>(info.m),
+            static_cast<int>(info.n),
+            static_cast<int>(info.k),
             &alpha,
             a,
             a_type,
-            info.a_matrix.ld(),
+            static_cast<int>(info.a_matrix.ld()),
             info.a_matrix.stride,
             b,
             b_type,
-            info.b_matrix.ld(),
+            static_cast<int>(info.b_matrix.ld()),
             info.b_matrix.stride,
             &beta,
             c,
             c_type,
-            info.c_matrix.ld(),
+            static_cast<int>(info.c_matrix.ld()),
             info.c_matrix.stride,
-            info.batch,
+            static_cast<int>(info.batch),
             compute_type,
             CUBLAS_GEMM_DEFAULT_TENSOR_OP); });
     return INFINIOP_STATUS_SUCCESS;
...
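`cublasGemmStridedBatchedEx` declares m, n, k, the leading dimensions, and the batch count as `int`, while the matmul info stores them as wider integers, so each argument is now narrowed explicitly; with warnings treated as errors the previous implicit conversions would break the build. A checked-narrowing helper like the one below is one alternative to a bare `static_cast<int>`; it is a hypothetical illustration, not something this commit adds.

```cpp
// Hypothetical checked narrowing helper for 64-bit sizes passed to cuBLAS
// parameters declared as int. Shown only to illustrate the narrowing that the
// diff above makes explicit; InfiniOp itself uses plain static_cast<int>.
#include <cstdint>
#include <limits>
#include <stdexcept>

inline int to_cublas_int(int64_t v) {
    if (v < 0 || v > std::numeric_limits<int>::max()) {
        throw std::overflow_error("dimension does not fit in cuBLAS int");
    }
    return static_cast<int>(v);
}

// e.g. cublasGemmStridedBatchedEx(..., to_cublas_int(info.m), ...);
```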
#include "../utils.h"
#include "infiniop/ops/matmul.h" #include "infiniop/ops/matmul.h"
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
......
#include "infiniop/ops/random_sample.h"
__C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) {
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *) desc_ptr, result, probs);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaCreateRandomSampleDescriptor((CudaHandle_t) handle, (RandomSampleCudaDescriptor_t *) desc_ptr, result, probs);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateRandomSampleDescriptor((BangHandle_t) handle,
(RandomSampleBangDescriptor_t *) desc_ptr, result,
probs);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendCreateRandomSampleDescriptor((AscendHandle_t) handle,
(RandomSampleAscendDescriptor_t *) desc_ptr, result, probs);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateRandomSampleDescriptor((MacaHandle_t) handle,
(RandomSampleMacaDescriptor_t *) desc_ptr, result,
probs);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaCreateRandomSampleDescriptor((MusaHandle_t) handle, (RandomSampleMusaDescriptor_t *) desc_ptr, result, probs);
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
};
__C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t) desc, size);
// return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t) desc, size);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *result,
void const *probs,
float random_val,
float topp,
int topk,
float temperature,
void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuRandomSample((RandomSampleCpuDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaRandomSample((RandomSampleCudaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangRandomSample((RandomSampleBangDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendRandomSample((RandomSampleAscendDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaRandomSample((RandomSampleMacaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaRandomSample((RandomSampleMusaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t) desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t) desc);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t) desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t) desc);
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
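`infiniopRandomSample` takes a probability/logit buffer plus `random_val`, `topp`, `topk`, and `temperature`, and writes the chosen token index into `result`. The backend kernels are not part of this diff; the sketch below is only a conceptual CPU rendering of temperature + top-k + top-p (nucleus) sampling to clarify what the parameters mean, and it is an assumption about the intended semantics rather than the library's algorithm.

```cpp
// Conceptual temperature / top-k / top-p sampling over a logit array.
// An interpretation of the infiniopRandomSample parameters, not its kernels.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <vector>

size_t random_sample_sketch(const std::vector<float> &logits,
                            float random_val, float topp, int topk,
                            float temperature) {
    const size_t n = logits.size();
    // Softmax with temperature.
    std::vector<float> probs(n);
    float max_logit = *std::max_element(logits.begin(), logits.end());
    float sum = 0.f;
    for (size_t i = 0; i < n; ++i) {
        probs[i] = std::exp((logits[i] - max_logit) / temperature);
        sum += probs[i];
    }
    for (auto &p : probs) p /= sum;

    // Sort indices by descending probability.
    std::vector<size_t> idx(n);
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(),
              [&](size_t a, size_t b) { return probs[a] > probs[b]; });

    // Keep the smallest prefix satisfying both top-k and top-p.
    size_t keep = std::min<size_t>(n, topk > 0 ? (size_t)topk : n);
    size_t cutoff = keep;
    float cum = 0.f;
    for (size_t i = 0; i < keep; ++i) {
        cum += probs[idx[i]];
        if (cum >= topp) { cutoff = i + 1; break; }
    }

    // Sample within the kept prefix using random_val in [0, 1).
    float renorm = 0.f;
    for (size_t i = 0; i < cutoff; ++i) renorm += probs[idx[i]];
    float target = random_val * renorm, acc = 0.f;
    for (size_t i = 0; i < cutoff; ++i) {
        acc += probs[idx[i]];
        if (acc >= target) return idx[i];
    }
    return idx[cutoff - 1];
}
```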
#include "infiniop/ops/rearrange.h"
__C infiniopStatus_t infiniopCreateRearrangeDescriptor(
infiniopHandle_t handle,
infiniopRearrangeDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t dst,
infiniopTensorDescriptor_t src) {
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *) desc_ptr, dst, src);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaCreateRearrangeDescriptor((CudaHandle_t) handle, (RearrangeCudaDescriptor_t *) desc_ptr, dst, src);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateRearrangeDescriptor((BangHandle_t) handle, (RearrangeBangDescriptor_t *) desc_ptr, dst, src);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnCreateRearrangeDescriptor((AscendHandle_t) handle,
(RearrangeAclnnDescriptor_t *) desc_ptr,
dst,
src);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateRearrangeDescriptor((MacaHandle_t) handle, (RearrangeMacaDescriptor_t *) desc_ptr, dst, src);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaCreateRearrangeDescriptor((MusaHandle_t)handle, (RearrangeMusaDescriptor_t *) desc_ptr, dst, src);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuRearrange((RearrangeCpuDescriptor_t) desc, dst, src, stream);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaRearrange((RearrangeCudaDescriptor_t) desc, dst, src, stream);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangRearrange((RearrangeBangDescriptor_t) desc, dst, src, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnRearrange((RearrangeAclnnDescriptor_t) desc,
dst,
src,
stream);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaRearrange((RearrangeMacaDescriptor_t) desc, dst, src, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaRearrange((RearrangeMusaDescriptor_t) desc, dst, src, stream);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t) desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t) desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t) desc);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t) desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t) desc);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
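Rearrange copies a tensor from one strided layout into another without changing its contents (for example re-packing attention KV slices). The addressing rule is sketched below for the 2-D host case, assuming element strides; the real operator handles arbitrary rank and device memory.

```cpp
// Host-side sketch of a 2-D strided rearrange: copy src into dst where each
// tensor carries its own row/column strides (in elements). Only the
// addressing rule is shown; the library operates on descriptors and streams.
#include <cstddef>
#include <cstdint>

void rearrange_2d(float *dst, const int64_t dst_strides[2],
                  const float *src, const int64_t src_strides[2],
                  size_t rows, size_t cols) {
    for (size_t i = 0; i < rows; ++i) {
        for (size_t j = 0; j < cols; ++j) {
            dst[(int64_t)i * dst_strides[0] + (int64_t)j * dst_strides[1]] =
                src[(int64_t)i * src_strides[0] + (int64_t)j * src_strides[1]];
        }
    }
}
```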
#include "infiniop/ops/rms_norm.h"
__C infiniopStatus_t infiniopCreateRMSNormDescriptor(
infiniopHandle_t handle,
infiniopRMSNormDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w_desc,
float epsilon) {
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaCreateRMSNormDescriptor((CudaHandle_t) handle, (RMSNormCudaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateRMSNormDescriptor((BangHandle_t) handle, (RMSNormBangDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnCreateRMSNormDescriptor((AscendHandle_t) handle,
(RMSNormAclnnDescriptor_t *) desc_ptr,
y_desc,
x_desc,
w_desc,
epsilon);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateRMSNormDescriptor((MacaHandle_t) handle, (RMSNormMacaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaCreateRMSNormDescriptor((MusaHandle_t) handle, (RMSNormMusaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t) desc,
size);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t) desc, size);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, uint64_t workspace_size,
void *y, void const *x, void const *w, void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuRMSNorm((RMSNormCpuDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaRMSNorm((RMSNormCudaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangRMSNorm((RMSNormBangDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnRMSNorm((RMSNormAclnnDescriptor_t) desc,
workspace,
workspace_size,
y,
x,
w,
stream);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaRMSNorm((RMSNormMacaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaRMSNorm((RMSNormMusaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t) desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t) desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t) desc);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t) desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t) desc);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
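RMSNorm scales each row of `x` by the reciprocal of its root-mean-square and multiplies by the weight `w`, with `epsilon` keeping the denominator away from zero. A single-row reference sketch of that standard formula (not the backend kernels created here):

```cpp
// Reference RMSNorm for one row: y[i] = x[i] / sqrt(mean(x^2) + eps) * w[i].
// Illustrates the standard formula only; the per-device kernels behind
// infiniopRMSNorm are not part of this commit.
#include <cmath>
#include <cstddef>

void rms_norm_row(float *y, const float *x, const float *w,
                  size_t d, float epsilon) {
    float sum_sq = 0.f;
    for (size_t i = 0; i < d; ++i) {
        sum_sq += x[i] * x[i];
    }
    float inv_rms = 1.f / std::sqrt(sum_sq / (float)d + epsilon);
    for (size_t i = 0; i < d; ++i) {
        y[i] = x[i] * inv_rms * w[i];
    }
}
```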
#include "infiniop/ops/rotary_embedding.h"
__C infiniopStatus_t infiniopCreateRoPEDescriptor(
infiniopHandle_t handle, infiniopRoPEDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t t, infiniopTensorDescriptor_t pos_ids,
infiniopTensorDescriptor_t sin_table,
infiniopTensorDescriptor_t cos_table) {
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateRoPEDescriptor((CpuHandle_t)handle,
(RoPECpuDescriptor_t *)desc_ptr, t,
pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaCreateRoPEDescriptor((CudaHandle_t)handle,
(RoPECudaDescriptor_t *)desc_ptr, t,
pos_ids, sin_table, cos_table);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateRoPEDescriptor((BangHandle_t)handle,
(RoPEBangDescriptor_t *)desc_ptr, t,
pos_ids, sin_table, cos_table);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendCreateRoPEDescriptor((AscendHandle_t)handle,
(RoPEAscendDescriptor_t *)desc_ptr, t,
pos_ids, sin_table, cos_table);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateRoPEDescriptor((MacaHandle_t)handle,
(RoPEMacaDescriptor_t *)desc_ptr, t,
pos_ids, sin_table, cos_table);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaCreateRoPEDescriptor((MusaHandle_t)handle,
(RoPEMusaDescriptor_t *)desc_ptr, t,
pos_ids, sin_table, cos_table);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
uint64_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuGetRoPEWorkspaceSize((RoPECpuDescriptor_t)desc, size);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaGetRoPEWorkspaceSize((RoPECudaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetRoPEWorkspaceSize((RoPEBangDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendGetRoPEWorkspaceSize((RoPEAscendDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetRoPEWorkspaceSize((RoPEMacaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetRoPEWorkspaceSize((RoPEMusaDescriptor_t)desc, size);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopRoPE(infiniopRoPEDescriptor_t desc,
void *workspace, uint64_t workspace_size,
void *t, void const *pos_ids,
void const *sin_table, void const *cos_table,
void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuRoPE((RoPECpuDescriptor_t)desc, workspace, workspace_size, t,
pos_ids, sin_table, cos_table, stream);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaRoPE((RoPECudaDescriptor_t)desc, workspace, workspace_size,
t, pos_ids, sin_table, cos_table, stream);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangRoPE((RoPEBangDescriptor_t)desc, workspace, workspace_size,
t, pos_ids, sin_table, cos_table, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendRoPE((RoPEAscendDescriptor_t)desc, workspace,
workspace_size, t, pos_ids, sin_table, cos_table,
stream);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaRoPE((RoPEMacaDescriptor_t)desc, workspace, workspace_size,
t, pos_ids, sin_table, cos_table, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaRoPE((RoPEMusaDescriptor_t)desc, workspace, workspace_size,
t, pos_ids, sin_table, cos_table, stream);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t
infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroyRoPEDescriptor((RoPECpuDescriptor_t)desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaDestroyRoPEDescriptor((RoPECudaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyRoPEDescriptor((RoPEBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendDestroyRoPEDescriptor((RoPEAscendDescriptor_t)desc);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyRoPEDescriptor((RoPEMacaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaDestroyRoPEDescriptor((RoPEMusaDescriptor_t)desc);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
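RoPE rotates pairs of features in `t` by position-dependent angles; `pos_ids` selects the row of the precomputed `sin_table`/`cos_table` for each token. The per-token sketch below assumes the common adjacent-pair layout and a table row of width d/2, which the dispatch layer above does not actually reveal.

```cpp
// Conceptual rotary-embedding update for one token: each feature pair
// (t[2i], t[2i+1]) is rotated by the angle whose sin/cos were precomputed for
// this position. The pair layout and table shape are assumptions.
#include <cstddef>

void rope_token(float *t, const float *sin_row, const float *cos_row,
                size_t d /* feature dim, assumed even */) {
    for (size_t i = 0; i < d / 2; ++i) {
        float a = t[2 * i];
        float b = t[2 * i + 1];
        t[2 * i]     = a * cos_row[i] - b * sin_row[i];
        t[2 * i + 1] = a * sin_row[i] + b * cos_row[i];
    }
}
```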
#include "infiniop/ops/swiglu.h"
__C infiniopStatus_t infiniopCreateSwiGLUDescriptor(
infiniopHandle_t handle, infiniopSwiGLUDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateSwiGLUDescriptor(
handle, (SwiGLUCpuDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaCreateSwiGLUDescriptor((CudaHandle_t)handle,
(SwiGLUCudaDescriptor_t *)desc_ptr,
c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateSwiGLUDescriptor((BangHandle_t)handle,
(SwiGLUBangDescriptor_t *)desc_ptr,
c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return ascendCreateSwiGLUDescriptor(
(AscendHandle_t)handle, (SwiGLUAscendDescriptor_t *)desc_ptr,
c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateSwiGLUDescriptor((MacaHandle_t)handle,
(SwiGLUMacaDescriptor_t *)desc_ptr,
c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaCreateSwiGLUDescriptor(
handle, (SwiGLUMusaDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
};
__C infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, void *c,
void const *a, void const *b,
void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuSwiGLU((SwiGLUCpuDescriptor_t)desc, c, a, b, stream);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaSwiGLU((SwiGLUCudaDescriptor_t)desc, c, a, b, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangSwiGLU((SwiGLUBangDescriptor_t)desc, c, a, b, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return ascendSwiGLU((SwiGLUAscendDescriptor_t)desc, c, a, b, stream);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu:
return macaSwiGLU((SwiGLUMacaDescriptor_t)desc, c, a, b, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaSwiGLU((SwiGLUMusaDescriptor_t)desc, c, a, b, stream);
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t
infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroySwiGLUDescriptor((SwiGLUCpuDescriptor_t)desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaDestroySwiGLUDescriptor((SwiGLUCudaDescriptor_t)desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t)desc);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu:
return macaDestroySwiGLUDescriptor((SwiGLUMacaDescriptor_t)desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaDestroySwiGLUDescriptor((SwiGLUMusaDescriptor_t)desc);
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
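SwiGLU multiplies a SiLU-gated projection with a linear projection element-wise and writes the result to `c`. Which of `a` and `b` is the gated operand is a library convention not visible in this dispatch layer; the sketch below assumes `a` is gated.

```cpp
// Element-wise SwiGLU sketch: c = silu(a) * b, with silu(x) = x * sigmoid(x).
// Treating a as the gated operand is an assumption, not confirmed by the diff.
#include <cmath>
#include <cstddef>

void swiglu_sketch(float *c, const float *a, const float *b, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        float silu = a[i] / (1.f + std::exp(-a[i]));  // a * sigmoid(a)
        c[i] = silu * b[i];
    }
}
```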
@@ -13,32 +13,28 @@
 
 #define ROUND_UP_DIV(x, y) ((x + y - 1) / y)
 
 #define CHECK_ERROR(call, target, errCode) \
     do { \
         if (auto value = (call); value == (target)) { \
-            std::cerr << "Error: expected " << (target) \
-                      << " but got " << value \
-                      << " in file " << __FILE__ \
-                      << ", function " << __func__ \
-                      << ", line " << __LINE__ << std::endl; \
+            std::cerr << "Error: expected " << (target) << " but got " \
+                      << value << " in file " << __FILE__ << ", function " \
+                      << __func__ << ", line " << __LINE__ << std::endl; \
             return (errCode); \
         } \
     } while (0)
 
 #define CREATE_CHECK_ERROR(expr, value, target, errCode) \
     expr; \
     CHECK_ERROR(value, target, errCode)
 
 #define CHECK_STATUS(call, target) \
     do { \
         if (auto value = (call); value != (target)) { \
-            std::cerr << "Error: expected " << (target) \
-                      << " but got " << value \
-                      << " in file " << __FILE__ \
-                      << ", function " << __func__ \
-                      << ", line " << __LINE__ << std::endl; \
+            std::cerr << "Error: expected " << (target) << " but got " \
+                      << value << " in file " << __FILE__ << ", function " \
+                      << __func__ << ", line " << __LINE__ << std::endl; \
             return value; \
         } \
     } while (0)
 
 inline std::vector<int64_t> get_byte_strides(infiniopTensorDescriptor_t desc) {
@@ -53,8 +49,9 @@ inline std::vector<int64_t> get_byte_strides(infiniopTensorDescriptor_t desc) {
 // calculate the broadcasted shape for two tensors
 inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
                               const uint64_t *shape2, uint64_t ndim2,
-                              uint64_t *broadcast_shape, uint64_t *padded_shape1,
-                              uint64_t *padded_shape2, uint64_t max_rank) {
+                              uint64_t *broadcast_shape,
+                              uint64_t *padded_shape1, uint64_t *padded_shape2,
+                              uint64_t max_rank) {
     // prepending and initializing
     std::fill(padded_shape1, padded_shape1 + max_rank, 1);
     std::fill(padded_shape2, padded_shape2 + max_rank, 1);
@@ -63,7 +60,8 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
 
     // compute broadcasted shape
     for (size_t i = 0; i < max_rank; ++i) {
-        if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) {
+        if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 ||
+            padded_shape2[i] == 1) {
             broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]);
         } else {
             return false;
@@ -73,31 +71,39 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
     return true;
 }
 
-// check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes
-inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c,
+// check if the shape of tensor c is valid after broadcasting tensors a and b
+// and also get the broadcasted shapes
+inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
+                                  infiniopTensorDescriptor_t b,
+                                  infiniopTensorDescriptor_t c,
                                   uint64_t broadcast_ndim) {
-    std::vector<uint64_t>
-        broadcast_shape_(broadcast_ndim),
-        padded_shape1_(broadcast_ndim),
-        padded_shape2_(broadcast_ndim);
+    std::vector<uint64_t> broadcast_shape_(broadcast_ndim),
+        padded_shape1_(broadcast_ndim), padded_shape2_(broadcast_ndim);
     auto broadcast_shape = broadcast_shape_.data(),
          padded_shape1 = padded_shape1_.data(),
          padded_shape2 = padded_shape2_.data();
-    if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) {
+    if (broadcast_ndim != c->ndim ||
+        !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim,
+                           broadcast_shape, padded_shape1, padded_shape2,
+                           broadcast_ndim)) {
         return false;
     }
-    return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim, c->shape);
+    return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim,
+                      c->shape);
 }
 
-// check if the shape of tensor src can be validly broadcasted to that of the tensor dst
-inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) {
+// check if the shape of tensor src can be validly broadcasted to that of the
+// tensor dst
+inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst,
+                                  infiniopTensorDescriptor_t src) {
     if (dst->ndim < src->ndim) {
         return false;
     }
     std::vector<size_t> padded_shape_(dst->ndim);
     auto padded_shape = padded_shape_.data();
     std::fill(padded_shape, padded_shape + dst->ndim, 1);
-    std::copy(src->shape, src->shape + src->ndim, padded_shape + dst->ndim - src->ndim);
+    std::copy(src->shape, src->shape + src->ndim,
+              padded_shape + dst->ndim - src->ndim);
     for (size_t i = 0; i < dst->ndim; ++i) {
         if (padded_shape[i] != dst->shape[i] && padded_shape[i] != 1) {
             return false;
@@ -107,7 +113,9 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensor
 }
 
 // check if the shape of tensor c is valid after broadcasting tensors a and b
-inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c) {
+inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
+                                  infiniopTensorDescriptor_t b,
+                                  infiniopTensorDescriptor_t c) {
     return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim));
 }
@@ -120,7 +128,8 @@ inline size_t get_byte_size(infiniopTensorDescriptor_t desc) {
 }
 
 // permute the dimensions of a tensor descriptor
-inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const std::vector<size_t> &order) {
+inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
+                                          const std::vector<size_t> &order) {
     size_t ndim = desc->ndim;
     if (order.size() != ndim) {
         return nullptr;
@@ -134,14 +143,16 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const
         shape[i] = desc->shape[order[i]];
         strides[i] = desc->strides[order[i]];
     }
-    return new InfiniopTensorDescriptor{
-        desc->dtype, ndim, shape, strides};
+    return new InfiniopTensorDescriptor{desc->dtype, ndim, shape, strides};
 }
 
-// check if the dimensions [dim_start, dim_end] of a tensor descriptor are contiguous
-inline bool is_contiguous(const infiniopTensorDescriptor_t &desc, size_t dim_start, size_t dim_end) {
+// check if the dimensions [dim_start, dim_end] of a tensor descriptor are
+// contiguous
+inline bool is_contiguous(const infiniopTensorDescriptor_t &desc,
+                          size_t dim_start, size_t dim_end) {
     for (size_t i = dim_start + 1; i <= dim_end; i++) {
-        if (desc->strides[i - 1] != static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
+        if (desc->strides[i - 1] !=
+            static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
             return false;
         }
     }
@@ -156,7 +167,8 @@ inline bool is_contiguous(const infiniopTensorDescriptor_t &desc) {
 }
 
 // merge the dimensions [dim_start, dim_end] of a tensor descriptor
-inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, size_t dim_start, size_t dim_end) {
+inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc,
+                                            size_t dim_start, size_t dim_end) {
     size_t ndim = desc->ndim;
     if (dim_start > dim_end || dim_end >= ndim) {
         return nullptr;
@@ -185,14 +197,17 @@ inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, siz
         new_strides[index] = desc->strides[i];
         index++;
     }
-    return new InfiniopTensorDescriptor{
-        desc->dtype, new_ndim, new_shape, new_strides};
+    return new InfiniopTensorDescriptor{desc->dtype, new_ndim, new_shape,
+                                        new_strides};
 }
 
 // split the dimension dim of a tensor descriptor into multiple dimensions
-inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, size_t dim, const std::vector<size_t> &dims) {
+inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc,
+                                            size_t dim,
+                                            const std::vector<size_t> &dims) {
     size_t ndim = desc->ndim;
-    if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies{})) {
+    if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1,
+                                            std::multiplies{})) {
         return nullptr;
     }
     size_t new_ndim = ndim + dims.size() - 1;
@@ -206,7 +221,10 @@ inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, siz
     }
     for (size_t i = 0; i < dims.size(); i++) {
         new_shape[index] = dims[i];
-        new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, 1, std::multiplies<size_t>());
+        new_strides[index] =
+            desc->strides[dim] * desc->shape[dim] /
+            std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t)1,
+                            std::multiplies<size_t>());
         index++;
     }
     for (size_t i = dim + 1; i < ndim; i++) {
@@ -214,8 +232,8 @@ inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, siz
         new_strides[index] = desc->strides[i];
         index++;
     }
-    return new InfiniopTensorDescriptor{
-        desc->dtype, new_ndim, new_shape, new_strides};
+    return new InfiniopTensorDescriptor{desc->dtype, new_ndim, new_shape,
+                                        new_strides};
 }
 
-#endif// __UTILS_H__
+#endif // __UTILS_H__
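`getBroadcastShape` implements NumPy-style broadcasting: shapes are right-aligned, missing leading dimensions count as 1, and each aligned pair must match or contain a 1. A standalone sketch of the same rule, independent of the descriptor types above:

```cpp
// Standalone sketch of the broadcasting rule used by getBroadcastShape:
// align shapes on the right, pad with 1s, and take the max where one side is
// 1. Returns an empty vector if the shapes conflict.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<uint64_t> broadcast_shape(const std::vector<uint64_t> &a,
                                      const std::vector<uint64_t> &b) {
    size_t rank = std::max(a.size(), b.size());
    std::vector<uint64_t> out(rank, 1);
    for (size_t i = 0; i < rank; ++i) {
        uint64_t da = i < a.size() ? a[a.size() - 1 - i] : 1;
        uint64_t db = i < b.size() ? b[b.size() - 1 - i] : 1;
        if (da != db && da != 1 && db != 1) {
            return {};  // shapes are not broadcastable
        }
        out[rank - 1 - i] = std::max(da, db);
    }
    return out;
}

// e.g. broadcast_shape({4, 1, 3}, {2, 3}) == {4, 2, 3}
```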
@@ -2,6 +2,8 @@ target("infiniop-cpu")
     on_install(function (target) end)
     set_kind("static")
 
+    add_cxflags("-Wall", "-Werror")
+
     if not is_plat("windows") then
         add_cxflags("-fPIC")
     end
...
@@ -20,10 +20,12 @@ target("infiniop-cuda")
     if is_plat("windows") then
         add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
+        add_cuflags("-Xcompiler=/W3", "-Xcompiler=/WX")
         if CUDNN_ROOT ~= nil then
             add_linkdirs(CUDNN_ROOT .. "\\lib\\x64")
         end
     else
+        add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror")
        add_cuflags("-Xcompiler=-fPIC")
        add_culdflags("-Xcompiler=-fPIC")
        add_cxxflags("-fPIC")
...