Commit 7a833987 authored by PanZezhongQY's avatar PanZezhongQY
Browse files

feat: 添加大模型算子operator.cc,cpu和cuda视warning为错误

parent e3ea5bae
......@@ -52,14 +52,14 @@ uint16_t f32_to_f16(float val) {
// Infinity
return sign | 0x7C00;
} else if (exponent >= -14) {// Normalized case
return sign | ((exponent + 15) << 10) | (mantissa >> 13);
return (uint16_t)(sign | ((exponent + 15) << 10) | (mantissa >> 13));
} else if (exponent >= -24) {
mantissa |= 0x800000;// Add implicit leading 1
mantissa >>= (-14 - exponent);
return sign | (mantissa >> 13);
return (uint16_t)(sign | (mantissa >> 13));
} else {
// Too small for subnormal: return signed zero
return sign;
return (uint16_t)sign;
}
}
......
#include "infiniop/ops/causal_softmax.h"
// Create a causal-softmax descriptor for the backend recorded in
// handle->device. Each backend branch exists only when its ENABLE_* macro is
// defined; any other device yields DEVICE_TYPE_NOT_SUPPORTED.
__C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
    infiniopHandle_t handle,
    infiniopCausalSoftmaxDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateCausalSoftmaxDescriptor(handle, (CausalSoftmaxCpuDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateCausalSoftmaxDescriptor((CudaHandle_t) handle, (CausalSoftmaxCudaDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxBangDescriptor_t *) desc_ptr, y_desc);
            // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnCreateCausalSoftmaxDescriptor((AscendHandle_t) handle, (CausalSoftmaxAclnnDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateCausalSoftmaxDescriptor((MacaHandle_t) handle, (CausalSoftmaxMacaDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateCausalSoftmaxDescriptor((MusaHandle_t) handle, (CausalSoftmaxMusaDescriptor_t *) desc_ptr, y_desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Query the workspace size required by the causal-softmax operator, forwarding
// to the backend that created the descriptor (desc->device).
__C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCudaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t) desc, size);
            // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnGetCausalSoftmaxWorkspaceSize((CausalSoftmaxAclnnDescriptor_t) desc, size);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t) desc, size);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Run causal softmax in place on `data`, dispatching on desc->device.
// `workspace`/`workspace_size` must satisfy the size reported by
// infiniopGetCausalSoftmaxWorkspaceSize; `stream` is backend-specific.
__C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *data, void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCausalSoftmax((CausalSoftmaxCudaDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t) desc, workspace, workspace_size, data, stream);
            // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnCausalSoftmax((CausalSoftmaxAclnnDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release a causal-softmax descriptor via the backend that owns it.
__C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroyCausalSoftmaxDescriptor((CausalSoftmaxCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t) desc);
            // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnDestroyCausalSoftmaxDescriptor((CausalSoftmaxAclnnDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -47,8 +47,8 @@ typedef struct BlasMatrix {
*status = INFINIOP_STATUS_SUCCESS;
}
bool match_batch(size_t batch) const {
return this->batch == batch || this->batch == 1;
bool match_batch(size_t _batch) const {
return this->batch == _batch || this->batch == 1;
}
void transpose() {
......@@ -56,7 +56,7 @@ typedef struct BlasMatrix {
std::swap(row_stride, col_stride);
}
int ld() const {
int64_t ld() const {
if (this->row_stride == 1) {
return this->col_stride;
} else {
......
......@@ -31,24 +31,24 @@ infiniopStatus_t matmul_cuda(infiniopMatmulCudaDescriptor_t desc, void *c, float
handle,
op_a,
op_b,
info.m,
info.n,
info.k,
static_cast<int>(info.m),
static_cast<int>(info.n),
static_cast<int>(info.k),
&alpha,
a,
a_type,
info.a_matrix.ld(),
static_cast<int>(info.a_matrix.ld()),
info.a_matrix.stride,
b,
b_type,
info.b_matrix.ld(),
static_cast<int>(info.b_matrix.ld()),
info.b_matrix.stride,
&beta,
c,
c_type,
info.c_matrix.ld(),
static_cast<int>(info.c_matrix.ld()),
info.c_matrix.stride,
info.batch,
static_cast<int>(info.batch),
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP); });
return INFINIOP_STATUS_SUCCESS;
......
#include "../utils.h"
#include "infiniop/ops/matmul.h"
#ifdef ENABLE_CPU_API
......
#include "infiniop/ops/random_sample.h"
// Create a random-sample descriptor for the backend recorded in
// handle->device. `result` describes the output index tensor, `probs` the
// input probability tensor. Unsupported devices yield
// DEVICE_TYPE_NOT_SUPPORTED.
//
// Fixes: removed the stray semicolon after the function's closing brace (an
// empty top-level declaration that -Wpedantic diagnoses, and this file builds
// with warnings treated as errors); case bracing normalized to match the
// sibling dispatchers.
__C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *) desc_ptr, result, probs);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateRandomSampleDescriptor((CudaHandle_t) handle, (RandomSampleCudaDescriptor_t *) desc_ptr, result, probs);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateRandomSampleDescriptor((BangHandle_t) handle,
                                                    (RandomSampleBangDescriptor_t *) desc_ptr, result,
                                                    probs);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendCreateRandomSampleDescriptor((AscendHandle_t) handle,
                                                      (RandomSampleAscendDescriptor_t *) desc_ptr, result, probs);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateRandomSampleDescriptor((MacaHandle_t) handle,
                                                    (RandomSampleMacaDescriptor_t *) desc_ptr, result,
                                                    probs);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateRandomSampleDescriptor((MusaHandle_t) handle, (RandomSampleMusaDescriptor_t *) desc_ptr, result, probs);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Query the workspace size required by random sampling for this descriptor's
// backend (desc->device).
__C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t) desc, size);
            // return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t) desc, size);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t) desc, size);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Sample a token index from `probs` into `result`, dispatching on
// desc->device. `random_val`, `topp`, `topk` and `temperature` parameterize
// the sampling; their exact semantics live in each backend implementation.
__C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
                                          void *workspace,
                                          uint64_t workspace_size,
                                          void *result,
                                          void const *probs,
                                          float random_val,
                                          float topp,
                                          int topk,
                                          float temperature,
                                          void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuRandomSample((RandomSampleCpuDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaRandomSample((RandomSampleCudaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangRandomSample((RandomSampleBangDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendRandomSample((RandomSampleAscendDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaRandomSample((RandomSampleMacaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaRandomSample((RandomSampleMusaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release a random-sample descriptor via the backend that owns it.
__C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#include "infiniop/ops/rearrange.h"
// Create a rearrange (strided copy) descriptor for the backend recorded in
// handle->device; `dst` and `src` describe the destination and source layouts.
__C infiniopStatus_t infiniopCreateRearrangeDescriptor(
    infiniopHandle_t handle,
    infiniopRearrangeDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t dst,
    infiniopTensorDescriptor_t src) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *) desc_ptr, dst, src);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateRearrangeDescriptor((CudaHandle_t) handle, (RearrangeCudaDescriptor_t *) desc_ptr, dst, src);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateRearrangeDescriptor((BangHandle_t) handle, (RearrangeBangDescriptor_t *) desc_ptr, dst, src);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnCreateRearrangeDescriptor((AscendHandle_t) handle,
                                                  (RearrangeAclnnDescriptor_t *) desc_ptr,
                                                  dst,
                                                  src);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateRearrangeDescriptor((MacaHandle_t) handle, (RearrangeMacaDescriptor_t *) desc_ptr, dst, src);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateRearrangeDescriptor((MusaHandle_t) handle, (RearrangeMusaDescriptor_t *) desc_ptr, dst, src);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Copy `src` into `dst` following the layouts captured at descriptor-creation
// time, dispatching on desc->device.
__C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuRearrange((RearrangeCpuDescriptor_t) desc, dst, src, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaRearrange((RearrangeCudaDescriptor_t) desc, dst, src, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangRearrange((RearrangeBangDescriptor_t) desc, dst, src, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnRearrange((RearrangeAclnnDescriptor_t) desc,
                                  dst,
                                  src,
                                  stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaRearrange((RearrangeMacaDescriptor_t) desc, dst, src, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaRearrange((RearrangeMusaDescriptor_t) desc, dst, src, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release a rearrange descriptor via the backend that owns it.
__C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#include "infiniop/ops/rms_norm.h"
// Create an RMSNorm descriptor (y = rmsnorm(x) * w, with `epsilon` added
// inside the normalization) for the backend recorded in handle->device.
__C infiniopStatus_t infiniopCreateRMSNormDescriptor(
    infiniopHandle_t handle,
    infiniopRMSNormDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    float epsilon) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateRMSNormDescriptor((CudaHandle_t) handle, (RMSNormCudaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateRMSNormDescriptor((BangHandle_t) handle, (RMSNormBangDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnCreateRMSNormDescriptor((AscendHandle_t) handle,
                                                (RMSNormAclnnDescriptor_t *) desc_ptr,
                                                y_desc,
                                                x_desc,
                                                w_desc,
                                                epsilon);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateRMSNormDescriptor((MacaHandle_t) handle, (RMSNormMacaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateRMSNormDescriptor((MusaHandle_t) handle, (RMSNormMusaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Query the workspace size required by RMSNorm for this descriptor's backend.
__C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t) desc, size);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t) desc,
                                                size);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t) desc, size);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Apply RMS normalization: reads `x` and weights `w`, writes `y`, dispatching
// on desc->device. Workspace must satisfy infiniopGetRMSNormWorkspaceSize.
__C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, uint64_t workspace_size,
                                     void *y, void const *x, void const *w, void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuRMSNorm((RMSNormCpuDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaRMSNorm((RMSNormCudaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangRMSNorm((RMSNormBangDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnRMSNorm((RMSNormAclnnDescriptor_t) desc,
                                workspace,
                                workspace_size,
                                y,
                                x,
                                w,
                                stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaRMSNorm((RMSNormMacaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaRMSNorm((RMSNormMusaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release an RMSNorm descriptor via the backend that owns it.
__C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#include "infiniop/ops/rotary_embedding.h"
// Create a rotary-position-embedding (RoPE) descriptor for the backend
// recorded in handle->device. `t` is the tensor to rotate, `pos_ids` the
// position indices, and `sin_table`/`cos_table` the precomputed tables.
__C infiniopStatus_t infiniopCreateRoPEDescriptor(
    infiniopHandle_t handle, infiniopRoPEDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t t, infiniopTensorDescriptor_t pos_ids,
    infiniopTensorDescriptor_t sin_table,
    infiniopTensorDescriptor_t cos_table) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateRoPEDescriptor((CpuHandle_t) handle,
                                           (RoPECpuDescriptor_t *) desc_ptr, t,
                                           pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateRoPEDescriptor((CudaHandle_t) handle,
                                            (RoPECudaDescriptor_t *) desc_ptr, t,
                                            pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateRoPEDescriptor((BangHandle_t) handle,
                                            (RoPEBangDescriptor_t *) desc_ptr, t,
                                            pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendCreateRoPEDescriptor((AscendHandle_t) handle,
                                              (RoPEAscendDescriptor_t *) desc_ptr, t,
                                              pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateRoPEDescriptor((MacaHandle_t) handle,
                                            (RoPEMacaDescriptor_t *) desc_ptr, t,
                                            pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateRoPEDescriptor((MusaHandle_t) handle,
                                            (RoPEMusaDescriptor_t *) desc_ptr, t,
                                            pos_ids, sin_table, cos_table);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Query the workspace size required by RoPE for this descriptor's backend.
__C infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
                                                  uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuGetRoPEWorkspaceSize((RoPECpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaGetRoPEWorkspaceSize((RoPECudaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangGetRoPEWorkspaceSize((RoPEBangDescriptor_t) desc, size);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendGetRoPEWorkspaceSize((RoPEAscendDescriptor_t) desc, size);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaGetRoPEWorkspaceSize((RoPEMacaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaGetRoPEWorkspaceSize((RoPEMusaDescriptor_t) desc, size);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Apply rotary position embedding in place on `t`, dispatching on
// desc->device. Workspace must satisfy infiniopGetRoPEWorkspaceSize.
__C infiniopStatus_t infiniopRoPE(infiniopRoPEDescriptor_t desc,
                                  void *workspace, uint64_t workspace_size,
                                  void *t, void const *pos_ids,
                                  void const *sin_table, void const *cos_table,
                                  void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuRoPE((RoPECpuDescriptor_t) desc, workspace, workspace_size, t,
                           pos_ids, sin_table, cos_table, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaRoPE((RoPECudaDescriptor_t) desc, workspace, workspace_size,
                            t, pos_ids, sin_table, cos_table, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangRoPE((RoPEBangDescriptor_t) desc, workspace, workspace_size,
                            t, pos_ids, sin_table, cos_table, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendRoPE((RoPEAscendDescriptor_t) desc, workspace,
                              workspace_size, t, pos_ids, sin_table, cos_table,
                              stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaRoPE((RoPEMacaDescriptor_t) desc, workspace, workspace_size,
                            t, pos_ids, sin_table, cos_table, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaRoPE((RoPEMusaDescriptor_t) desc, workspace, workspace_size,
                            t, pos_ids, sin_table, cos_table, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release a RoPE descriptor via the backend that owns it.
__C infiniopStatus_t
infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroyRoPEDescriptor((RoPECpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroyRoPEDescriptor((RoPECudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroyRoPEDescriptor((RoPEBangDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendDestroyRoPEDescriptor((RoPEAscendDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroyRoPEDescriptor((RoPEMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroyRoPEDescriptor((RoPEMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#include "infiniop/ops/swiglu.h"
// Create a SwiGLU descriptor (c = swiglu(a, b)) for the backend recorded in
// handle->device.
//
// Fixes: removed the stray semicolon after the function's closing brace
// (empty top-level declaration, diagnosed under -Wpedantic — this file builds
// with warnings treated as errors), and cast `handle` to (MusaHandle_t) in
// the Mthreads branch for consistency with every other musa call site in
// this file (all of which cast the generic handle to the backend handle type).
__C infiniopStatus_t infiniopCreateSwiGLUDescriptor(
    infiniopHandle_t handle, infiniopSwiGLUDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateSwiGLUDescriptor(
                handle, (SwiGLUCpuDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateSwiGLUDescriptor((CudaHandle_t) handle,
                                              (SwiGLUCudaDescriptor_t *) desc_ptr,
                                              c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateSwiGLUDescriptor((BangHandle_t) handle,
                                              (SwiGLUBangDescriptor_t *) desc_ptr,
                                              c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendCreateSwiGLUDescriptor(
                (AscendHandle_t) handle, (SwiGLUAscendDescriptor_t *) desc_ptr,
                c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateSwiGLUDescriptor((MacaHandle_t) handle,
                                              (SwiGLUMacaDescriptor_t *) desc_ptr,
                                              c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateSwiGLUDescriptor(
                (MusaHandle_t) handle, (SwiGLUMusaDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Compute SwiGLU: reads `a` and `b`, writes `c`, dispatching on desc->device.
__C infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, void *c,
                                    void const *a, void const *b,
                                    void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuSwiGLU((SwiGLUCpuDescriptor_t) desc, c, a, b, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaSwiGLU((SwiGLUCudaDescriptor_t) desc, c, a, b, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangSwiGLU((SwiGLUBangDescriptor_t) desc, c, a, b, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendSwiGLU((SwiGLUAscendDescriptor_t) desc, c, a, b, stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaSwiGLU((SwiGLUMacaDescriptor_t) desc, c, a, b, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaSwiGLU((SwiGLUMusaDescriptor_t) desc, c, a, b, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release a SwiGLU descriptor via the backend that owns it.
__C infiniopStatus_t
infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroySwiGLUDescriptor((SwiGLUCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroySwiGLUDescriptor((SwiGLUCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroySwiGLUDescriptor((SwiGLUMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroySwiGLUDescriptor((SwiGLUMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -13,32 +13,28 @@
#define ROUND_UP_DIV(x, y) ((x + y - 1) / y)
#define CHECK_ERROR(call, target, errCode) \
do { \
if (auto value = (call); value == (target)) { \
std::cerr << "Error: expected " << (target) \
<< " but got " << value \
<< " in file " << __FILE__ \
<< ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return (errCode); \
} \
#define CHECK_ERROR(call, target, errCode) \
do { \
if (auto value = (call); value == (target)) { \
std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl; \
return (errCode); \
} \
} while (0)
#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
expr; \
#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
expr; \
CHECK_ERROR(value, target, errCode)
#define CHECK_STATUS(call, target) \
do { \
if (auto value = (call); value != (target)) { \
std::cerr << "Error: expected " << (target) \
<< " but got " << value \
<< " in file " << __FILE__ \
<< ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return value; \
} \
#define CHECK_STATUS(call, target) \
do { \
if (auto value = (call); value != (target)) { \
std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl; \
return value; \
} \
} while (0)
inline std::vector<int64_t> get_byte_strides(infiniopTensorDescriptor_t desc) {
......@@ -53,8 +49,9 @@ inline std::vector<int64_t> get_byte_strides(infiniopTensorDescriptor_t desc) {
// calculate the broadcasted shape for two tensors
inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
const uint64_t *shape2, uint64_t ndim2,
uint64_t *broadcast_shape, uint64_t *padded_shape1,
uint64_t *padded_shape2, uint64_t max_rank) {
uint64_t *broadcast_shape,
uint64_t *padded_shape1, uint64_t *padded_shape2,
uint64_t max_rank) {
// prepending and initializing
std::fill(padded_shape1, padded_shape1 + max_rank, 1);
std::fill(padded_shape2, padded_shape2 + max_rank, 1);
......@@ -63,7 +60,8 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
// compute broadcasted shape
for (size_t i = 0; i < max_rank; ++i) {
if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) {
if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 ||
padded_shape2[i] == 1) {
broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]);
} else {
return false;
......@@ -73,31 +71,39 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
return true;
}
// check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c,
// check if the shape of tensor c is valid after broadcasting tensors a and b
// and also get the broadcasted shapes
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b,
infiniopTensorDescriptor_t c,
uint64_t broadcast_ndim) {
std::vector<uint64_t>
broadcast_shape_(broadcast_ndim),
padded_shape1_(broadcast_ndim),
padded_shape2_(broadcast_ndim);
std::vector<uint64_t> broadcast_shape_(broadcast_ndim),
padded_shape1_(broadcast_ndim), padded_shape2_(broadcast_ndim);
auto broadcast_shape = broadcast_shape_.data(),
padded_shape1 = padded_shape1_.data(),
padded_shape2 = padded_shape2_.data();
if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) {
if (broadcast_ndim != c->ndim ||
!getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim,
broadcast_shape, padded_shape1, padded_shape2,
broadcast_ndim)) {
return false;
}
return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim, c->shape);
return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim,
c->shape);
}
// check if the shape of tensor src can be validly broadcasted to that of the tensor dst
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) {
// check if the shape of tensor src can be validly broadcasted to that of the
// tensor dst
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst,
infiniopTensorDescriptor_t src) {
if (dst->ndim < src->ndim) {
return false;
}
std::vector<size_t> padded_shape_(dst->ndim);
auto padded_shape = padded_shape_.data();
std::fill(padded_shape, padded_shape + dst->ndim, 1);
std::copy(src->shape, src->shape + src->ndim, padded_shape + dst->ndim - src->ndim);
std::copy(src->shape, src->shape + src->ndim,
padded_shape + dst->ndim - src->ndim);
for (size_t i = 0; i < dst->ndim; ++i) {
if (padded_shape[i] != dst->shape[i] && padded_shape[i] != 1) {
return false;
......@@ -107,7 +113,9 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensor
}
// check if the shape of tensor c is valid after broadcasting tensors a and b;
// convenience overload: the broadcast rank is the larger of a's and b's ranks,
// with the actual shape computation delegated to the 4-argument overload
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
                                  infiniopTensorDescriptor_t b,
                                  infiniopTensorDescriptor_t c) {
    return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim));
}
......@@ -120,7 +128,8 @@ inline size_t get_byte_size(infiniopTensorDescriptor_t desc) {
}
// permute the dimensions of a tensor descriptor
inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const std::vector<size_t> &order) {
inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
const std::vector<size_t> &order) {
size_t ndim = desc->ndim;
if (order.size() != ndim) {
return nullptr;
......@@ -134,14 +143,16 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const
shape[i] = desc->shape[order[i]];
strides[i] = desc->strides[order[i]];
}
return new InfiniopTensorDescriptor{
desc->dtype, ndim, shape, strides};
return new InfiniopTensorDescriptor{desc->dtype, ndim, shape, strides};
}
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are contiguous
inline bool is_contiguous(const infiniopTensorDescriptor_t &desc, size_t dim_start, size_t dim_end) {
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are
// contiguous
inline bool is_contiguous(const infiniopTensorDescriptor_t &desc,
size_t dim_start, size_t dim_end) {
for (size_t i = dim_start + 1; i <= dim_end; i++) {
if (desc->strides[i - 1] != static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
if (desc->strides[i - 1] !=
static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
return false;
}
}
......@@ -156,7 +167,8 @@ inline bool is_contiguous(const infiniopTensorDescriptor_t &desc) {
}
// merge the dimensions [dim_start, dim_end] of a tensor descriptor
inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, size_t dim_start, size_t dim_end) {
inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc,
size_t dim_start, size_t dim_end) {
size_t ndim = desc->ndim;
if (dim_start > dim_end || dim_end >= ndim) {
return nullptr;
......@@ -185,14 +197,17 @@ inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, siz
new_strides[index] = desc->strides[i];
index++;
}
return new InfiniopTensorDescriptor{
desc->dtype, new_ndim, new_shape, new_strides};
return new InfiniopTensorDescriptor{desc->dtype, new_ndim, new_shape,
new_strides};
}
// split the dimension dim of a tensor descriptor into multiple dimensions
inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, size_t dim, const std::vector<size_t> &dims) {
inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc,
size_t dim,
const std::vector<size_t> &dims) {
size_t ndim = desc->ndim;
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies{})) {
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1,
std::multiplies{})) {
return nullptr;
}
size_t new_ndim = ndim + dims.size() - 1;
......@@ -206,7 +221,10 @@ inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, siz
}
for (size_t i = 0; i < dims.size(); i++) {
new_shape[index] = dims[i];
new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, 1, std::multiplies<size_t>());
new_strides[index] =
desc->strides[dim] * desc->shape[dim] /
std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t)1,
std::multiplies<size_t>());
index++;
}
for (size_t i = dim + 1; i < ndim; i++) {
......@@ -214,8 +232,8 @@ inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, siz
new_strides[index] = desc->strides[i];
index++;
}
return new InfiniopTensorDescriptor{
desc->dtype, new_ndim, new_shape, new_strides};
return new InfiniopTensorDescriptor{desc->dtype, new_ndim, new_shape,
new_strides};
}
#endif// __UTILS_H__
#endif // __UTILS_H__
......@@ -2,6 +2,8 @@ target("infiniop-cpu")
on_install(function (target) end)
set_kind("static")
add_cxflags("-Wall", "-Werror")
if not is_plat("windows") then
add_cxflags("-fPIC")
end
......
......@@ -20,10 +20,12 @@ target("infiniop-cuda")
if is_plat("windows") then
add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
add_cuflags("-Xcompiler=/W3", "-Xcompiler=/WX")
if CUDNN_ROOT ~= nil then
add_linkdirs(CUDNN_ROOT .. "\\lib\\x64")
end
else
add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror")
add_cuflags("-Xcompiler=-fPIC")
add_culdflags("-Xcompiler=-fPIC")
add_cxxflags("-fPIC")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment