Unverified Commit 3c31dc6c authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #45 from YdrMaster/main

issue/52 代码格式化:机制和效果
parents 16dad776 e5ed9fa1
...@@ -38,8 +38,8 @@ cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc) { ...@@ -38,8 +38,8 @@ cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc) {
template <typename Tdata> template <typename Tdata>
infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c, infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
float beta, void const *a, void const *b, float beta, void const *a, void const *b,
float alpha) { float alpha) {
auto info = desc->info; auto info = desc->info;
if (info.is_transed) { if (info.is_transed) {
...@@ -49,20 +49,11 @@ infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c, ...@@ -49,20 +49,11 @@ infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
for (size_t i = 0; i < info.batch; ++i) { for (size_t i = 0; i < info.batch; ++i) {
for (size_t m_ = 0; m_ < info.m; ++m_) { for (size_t m_ = 0; m_ < info.m; ++m_) {
for (size_t n_ = 0; n_ < info.n; ++n_) { for (size_t n_ = 0; n_ < info.n; ++n_) {
auto c_ = reinterpret_cast<Tdata *>(c) + auto c_ = reinterpret_cast<Tdata *>(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride;
i * info.c_matrix.stride +
m_ * info.c_matrix.row_stride +
n_ * info.c_matrix.col_stride;
float sum = 0; float sum = 0;
for (size_t k_ = 0; k_ < info.k; ++k_) { for (size_t k_ = 0; k_ < info.k; ++k_) {
auto a_ = reinterpret_cast<Tdata const *>(a) + auto a_ = reinterpret_cast<Tdata const *>(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride;
i * info.a_matrix.stride + auto b_ = reinterpret_cast<Tdata const *>(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride;
m_ * info.a_matrix.row_stride +
k_ * info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) +
i * info.b_matrix.stride +
n_ * info.b_matrix.col_stride +
k_ * info.b_matrix.row_stride;
if constexpr (std::is_same<Tdata, uint16_t>::value) { if constexpr (std::is_same<Tdata, uint16_t>::value) {
sum += f16_to_f32(*a_) * f16_to_f32(*b_); sum += f16_to_f32(*a_) * f16_to_f32(*b_);
} else { } else {
......
#include "./matmul_cuda.cuh"
#include "../../utils.h" #include "../../utils.h"
#include "./matmul_cuda.cuh"
infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle, infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
infiniopMatmulCudaDescriptor_t *desc_ptr, infiniopMatmulCudaDescriptor_t *desc_ptr,
......
#ifndef __INFINIOP_MATMUL_CUDA_H__ #ifndef __INFINIOP_MATMUL_CUDA_H__
#define __INFINIOP_MATMUL_CUDA_H__ #define __INFINIOP_MATMUL_CUDA_H__
#include "matmul_cuda_api.h"
#include "../../../devices/cuda/common_cuda.cuh" #include "../../../devices/cuda/common_cuda.cuh"
#include <memory>
#include "../blas.h" #include "../blas.h"
#include "matmul_cuda_api.h"
#include <memory>
typedef struct InfiniopMatmulCudaDescriptor { typedef struct InfiniopMatmulCudaDescriptor {
infiniDevice_t device; infiniDevice_t device;
...@@ -14,4 +14,4 @@ typedef struct InfiniopMatmulCudaDescriptor { ...@@ -14,4 +14,4 @@ typedef struct InfiniopMatmulCudaDescriptor {
std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool; std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool;
} InfiniopMatmulCudaDescriptor; } InfiniopMatmulCudaDescriptor;
#endif// __INFINIOP_MATMUL_CUDA_H__ #endif // __INFINIOP_MATMUL_CUDA_H__
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
#include "../../../devices/cuda/cuda_handle.h" #include "../../../devices/cuda/cuda_handle.h"
#include "infiniop/operator.h" #include "infiniop/operator.h"
struct InfiniopMatmulCudaDescriptor; struct InfiniopMatmulCudaDescriptor;
typedef struct InfiniopMatmulCudaDescriptor *infiniopMatmulCudaDescriptor_t; typedef struct InfiniopMatmulCudaDescriptor *infiniopMatmulCudaDescriptor_t;
...@@ -28,5 +27,4 @@ infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc, ...@@ -28,5 +27,4 @@ infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc); infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc);
#endif // __INFINIOP_MATMUL_CUDA_API_H__ #endif // __INFINIOP_MATMUL_CUDA_API_H__
#include "../../utils.h" #include "../../utils.h"
#include "./matmul_cuda.cuh" #include "./matmul_cuda.cuh"
template<typename Tdata> template <typename Tdata>
infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) { infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) {
auto info = desc->info; auto info = desc->info;
...@@ -26,7 +26,7 @@ infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c, ...@@ -26,7 +26,7 @@ infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c,
auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
use_cublas(desc->cublas_handle_pool, desc->device_id, (cudaStream_t) stream, use_cublas(desc->cublas_handle_pool, desc->device_id, (cudaStream_t)stream,
[&](cublasHandle_t handle) { cublasGemmStridedBatchedEx( [&](cublasHandle_t handle) { cublasGemmStridedBatchedEx(
handle, handle,
op_a, op_a,
......
...@@ -3,36 +3,36 @@ ...@@ -3,36 +3,36 @@
__C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) { __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) {
switch (handle->device) { switch (handle->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *) desc_ptr, result, probs); return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *)desc_ptr, result, probs);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: case DevNvGpu:
return cudaCreateRandomSampleDescriptor((CudaHandle_t) handle, (RandomSampleCudaDescriptor_t *) desc_ptr, result, probs); return cudaCreateRandomSampleDescriptor((CudaHandle_t)handle, (RandomSampleCudaDescriptor_t *)desc_ptr, result, probs);
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangCreateRandomSampleDescriptor((BangHandle_t) handle, return bangCreateRandomSampleDescriptor((BangHandle_t)handle,
(RandomSampleBangDescriptor_t *) desc_ptr, result, (RandomSampleBangDescriptor_t *)desc_ptr, result,
probs); probs);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return ascendCreateRandomSampleDescriptor((AscendHandle_t) handle, return ascendCreateRandomSampleDescriptor((AscendHandle_t)handle,
(RandomSampleAscendDescriptor_t *) desc_ptr, result, probs); (RandomSampleAscendDescriptor_t *)desc_ptr, result, probs);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaCreateRandomSampleDescriptor((MacaHandle_t) handle, return macaCreateRandomSampleDescriptor((MacaHandle_t)handle,
(RandomSampleMacaDescriptor_t *) desc_ptr, result, (RandomSampleMacaDescriptor_t *)desc_ptr, result,
probs); probs);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: case DevMthreadsGpu:
return musaCreateRandomSampleDescriptor((MusaHandle_t) handle, (RandomSampleMusaDescriptor_t *) desc_ptr, result, probs); return musaCreateRandomSampleDescriptor((MusaHandle_t)handle, (RandomSampleMusaDescriptor_t *)desc_ptr, result, probs);
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -41,35 +41,35 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl ...@@ -41,35 +41,35 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl
__C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size) { __C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t) desc, size); return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t)desc, size);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: { case DevNvGpu: {
return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t) desc, size); return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t)desc, size);
} }
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t) desc, size); return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t)desc, size);
// return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size); // return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t) desc, size); return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t)desc, size);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t) desc, size); return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t)desc, size);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t) desc, size); return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t)desc, size);
} }
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -87,31 +87,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc, ...@@ -87,31 +87,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
void *stream) { void *stream) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuRandomSample((RandomSampleCpuDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); return cpuRandomSample((RandomSampleCpuDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: case DevNvGpu:
return cudaRandomSample((RandomSampleCudaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); return cudaRandomSample((RandomSampleCudaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangRandomSample((RandomSampleBangDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); return bangRandomSample((RandomSampleBangDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return ascendRandomSample((RandomSampleAscendDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); return ascendRandomSample((RandomSampleAscendDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaRandomSample((RandomSampleMacaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); return macaRandomSample((RandomSampleMacaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: case DevMthreadsGpu:
return musaRandomSample((RandomSampleMusaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream); return musaRandomSample((RandomSampleMusaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -120,31 +120,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc, ...@@ -120,31 +120,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
__C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc) { __C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t) desc); return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t)desc);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: case DevNvGpu:
return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t) desc); return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t)desc);
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t) desc); return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t) desc); return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t) desc); return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: case DevMthreadsGpu:
return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t) desc); return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t)desc);
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
...@@ -7,37 +7,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor( ...@@ -7,37 +7,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
infiniopTensorDescriptor_t src) { infiniopTensorDescriptor_t src) {
switch (handle->device) { switch (handle->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *) desc_ptr, dst, src); return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *)desc_ptr, dst, src);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: { case DevNvGpu: {
return cudaCreateRearrangeDescriptor((CudaHandle_t) handle, (RearrangeCudaDescriptor_t *) desc_ptr, dst, src); return cudaCreateRearrangeDescriptor((CudaHandle_t)handle, (RearrangeCudaDescriptor_t *)desc_ptr, dst, src);
} }
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangCreateRearrangeDescriptor((BangHandle_t) handle, (RearrangeBangDescriptor_t *) desc_ptr, dst, src); return bangCreateRearrangeDescriptor((BangHandle_t)handle, (RearrangeBangDescriptor_t *)desc_ptr, dst, src);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return aclnnCreateRearrangeDescriptor((AscendHandle_t) handle, return aclnnCreateRearrangeDescriptor((AscendHandle_t)handle,
(RearrangeAclnnDescriptor_t *) desc_ptr, (RearrangeAclnnDescriptor_t *)desc_ptr,
dst, dst,
src); src);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaCreateRearrangeDescriptor((MacaHandle_t) handle, (RearrangeMacaDescriptor_t *) desc_ptr, dst, src); return macaCreateRearrangeDescriptor((MacaHandle_t)handle, (RearrangeMacaDescriptor_t *)desc_ptr, dst, src);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaCreateRearrangeDescriptor((MusaHandle_t)handle, (RearrangeMusaDescriptor_t *) desc_ptr, dst, src); return musaCreateRearrangeDescriptor((MusaHandle_t)handle, (RearrangeMusaDescriptor_t *)desc_ptr, dst, src);
} }
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -46,37 +46,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor( ...@@ -46,37 +46,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
__C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream) { __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuRearrange((RearrangeCpuDescriptor_t) desc, dst, src, stream); return cpuRearrange((RearrangeCpuDescriptor_t)desc, dst, src, stream);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: { case DevNvGpu: {
return cudaRearrange((RearrangeCudaDescriptor_t) desc, dst, src, stream); return cudaRearrange((RearrangeCudaDescriptor_t)desc, dst, src, stream);
} }
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangRearrange((RearrangeBangDescriptor_t) desc, dst, src, stream); return bangRearrange((RearrangeBangDescriptor_t)desc, dst, src, stream);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return aclnnRearrange((RearrangeAclnnDescriptor_t) desc, return aclnnRearrange((RearrangeAclnnDescriptor_t)desc,
dst, dst,
src, src,
stream); stream);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaRearrange((RearrangeMacaDescriptor_t) desc, dst, src, stream); return macaRearrange((RearrangeMacaDescriptor_t)desc, dst, src, stream);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaRearrange((RearrangeMusaDescriptor_t) desc, dst, src, stream); return musaRearrange((RearrangeMusaDescriptor_t)desc, dst, src, stream);
} }
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -85,34 +85,34 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void ...@@ -85,34 +85,34 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void
__C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc) { __C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t) desc); return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t)desc);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: { case DevNvGpu: {
return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t) desc); return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t) desc); return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t) desc); return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t) desc); return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t) desc); return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t)desc);
} }
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
...@@ -9,38 +9,38 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor( ...@@ -9,38 +9,38 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
float epsilon) { float epsilon) {
switch (handle->device) { switch (handle->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: { case DevNvGpu: {
return cudaCreateRMSNormDescriptor((CudaHandle_t) handle, (RMSNormCudaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); return cudaCreateRMSNormDescriptor((CudaHandle_t)handle, (RMSNormCudaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
} }
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangCreateRMSNormDescriptor((BangHandle_t) handle, (RMSNormBangDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); return bangCreateRMSNormDescriptor((BangHandle_t)handle, (RMSNormBangDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return aclnnCreateRMSNormDescriptor((AscendHandle_t) handle, return aclnnCreateRMSNormDescriptor((AscendHandle_t)handle,
(RMSNormAclnnDescriptor_t *) desc_ptr, (RMSNormAclnnDescriptor_t *)desc_ptr,
y_desc, y_desc,
x_desc, x_desc,
w_desc, w_desc,
epsilon); epsilon);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaCreateRMSNormDescriptor((MacaHandle_t) handle, (RMSNormMacaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); return macaCreateRMSNormDescriptor((MacaHandle_t)handle, (RMSNormMacaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaCreateRMSNormDescriptor((MusaHandle_t) handle, (RMSNormMusaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon); return musaCreateRMSNormDescriptor((MusaHandle_t)handle, (RMSNormMusaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
} }
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -49,35 +49,35 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor( ...@@ -49,35 +49,35 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
__C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size) { __C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t) desc, size); return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t)desc, size);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: { case DevNvGpu: {
return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t) desc, size); return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t)desc, size);
} }
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t) desc, size); return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t)desc, size);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t) desc, return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t)desc,
size); size);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t) desc, size); return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t)desc, size);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t) desc, size); return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t)desc, size);
} }
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -87,40 +87,40 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor ...@@ -87,40 +87,40 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
void *y, void const *x, void const *w, void *stream) { void *y, void const *x, void const *w, void *stream) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuRMSNorm((RMSNormCpuDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); return cpuRMSNorm((RMSNormCpuDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: { case DevNvGpu: {
return cudaRMSNorm((RMSNormCudaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); return cudaRMSNorm((RMSNormCudaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
} }
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangRMSNorm((RMSNormBangDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); return bangRMSNorm((RMSNormBangDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return aclnnRMSNorm((RMSNormAclnnDescriptor_t) desc, return aclnnRMSNorm((RMSNormAclnnDescriptor_t)desc,
workspace, workspace,
workspace_size, workspace_size,
y, y,
x, x,
w, w,
stream); stream);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaRMSNorm((RMSNormMacaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); return macaRMSNorm((RMSNormMacaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaRMSNorm((RMSNormMusaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream); return musaRMSNorm((RMSNormMusaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
} }
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -129,34 +129,34 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor ...@@ -129,34 +129,34 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
__C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc) { __C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU #ifdef ENABLE_CPU
case DevCpu: case DevCpu:
return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t) desc); return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t)desc);
#endif #endif
#ifdef ENABLE_NV_GPU #ifdef ENABLE_NV_GPU
case DevNvGpu: { case DevNvGpu: {
return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t) desc); return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t) desc); return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: { case DevAscendNpu: {
return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t) desc); return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t) desc); return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t) desc); return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t)desc);
} }
#endif #endif
} }
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
...@@ -13,28 +13,28 @@ ...@@ -13,28 +13,28 @@
#define ROUND_UP_DIV(x, y) ((x + y - 1) / y) #define ROUND_UP_DIV(x, y) ((x + y - 1) / y)
#define CHECK_ERROR(call, target, errCode) \ #define CHECK_ERROR(call, target, errCode) \
do { \ do { \
if (auto value = (call); value == (target)) { \ if (auto value = (call); value == (target)) { \
std::cerr << "Error: expected " << (target) << " but got " \ std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function " \ << value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl; \ << __func__ << ", line " << __LINE__ << std::endl; \
return (errCode); \ return (errCode); \
} \ } \
} while (0) } while (0)
#define CREATE_CHECK_ERROR(expr, value, target, errCode) \ #define CREATE_CHECK_ERROR(expr, value, target, errCode) \
expr; \ expr; \
CHECK_ERROR(value, target, errCode) CHECK_ERROR(value, target, errCode)
#define CHECK_STATUS(call, target) \ #define CHECK_STATUS(call, target) \
do { \ do { \
if (auto value = (call); value != (target)) { \ if (auto value = (call); value != (target)) { \
std::cerr << "Error: expected " << (target) << " but got " \ std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function " \ << value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl; \ << __func__ << ", line " << __LINE__ << std::endl; \
return value; \ return value; \
} \ } \
} while (0) } while (0)
inline std::vector<int64_t> getByteStrides(infiniopTensorDescriptor_t desc) { inline std::vector<int64_t> getByteStrides(infiniopTensorDescriptor_t desc) {
...@@ -67,8 +67,7 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1, ...@@ -67,8 +67,7 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
// compute broadcasted shape // compute broadcasted shape
for (size_t i = 0; i < max_rank; ++i) { for (size_t i = 0; i < max_rank; ++i) {
if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) {
padded_shape2[i] == 1) {
broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]); broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]);
} else { } else {
return false; return false;
...@@ -89,10 +88,7 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, ...@@ -89,10 +88,7 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
auto broadcast_shape = broadcast_shape_.data(), auto broadcast_shape = broadcast_shape_.data(),
padded_shape1 = padded_shape1_.data(), padded_shape1 = padded_shape1_.data(),
padded_shape2 = padded_shape2_.data(); padded_shape2 = padded_shape2_.data();
if (broadcast_ndim != c->ndim || if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) {
!getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim,
broadcast_shape, padded_shape1, padded_shape2,
broadcast_ndim)) {
return false; return false;
} }
return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim, return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim,
...@@ -126,7 +122,6 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, ...@@ -126,7 +122,6 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim)); return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim));
} }
// permute the dimensions of a tensor descriptor // permute the dimensions of a tensor descriptor
inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
const std::vector<size_t> &order) { const std::vector<size_t> &order) {
...@@ -149,10 +144,9 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, ...@@ -149,10 +144,9 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are // check if the dimensions [dim_start, dim_end] of a tensor descriptor are
// contiguous // contiguous
inline bool isContiguous(const infiniopTensorDescriptor_t &desc, inline bool isContiguous(const infiniopTensorDescriptor_t &desc,
size_t dim_start, size_t dim_end) { size_t dim_start, size_t dim_end) {
for (size_t i = dim_start + 1; i <= dim_end; i++) { for (size_t i = dim_start + 1; i <= dim_end; i++) {
if (desc->strides[i - 1] != if (desc->strides[i - 1] != static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
return false; return false;
} }
} }
...@@ -168,7 +162,7 @@ inline bool isContiguous(const infiniopTensorDescriptor_t &desc) { ...@@ -168,7 +162,7 @@ inline bool isContiguous(const infiniopTensorDescriptor_t &desc) {
// merge the dimensions [dim_start, dim_end] of a tensor descriptor // merge the dimensions [dim_start, dim_end] of a tensor descriptor
inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc, inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,
size_t dim_start, size_t dim_end) { size_t dim_start, size_t dim_end) {
size_t ndim = desc->ndim; size_t ndim = desc->ndim;
if (dim_start > dim_end || dim_end >= ndim) { if (dim_start > dim_end || dim_end >= ndim) {
return nullptr; return nullptr;
...@@ -203,11 +197,10 @@ inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc, ...@@ -203,11 +197,10 @@ inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,
// split the dimension dim of a tensor descriptor into multiple dimensions // split the dimension dim of a tensor descriptor into multiple dimensions
inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc, inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
size_t dim, size_t dim,
const std::vector<size_t> &dims) { const std::vector<size_t> &dims) {
size_t ndim = desc->ndim; size_t ndim = desc->ndim;
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1, if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies{})) {
std::multiplies{})) {
return nullptr; return nullptr;
} }
size_t new_ndim = ndim + dims.size() - 1; size_t new_ndim = ndim + dims.size() - 1;
...@@ -221,10 +214,7 @@ inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc, ...@@ -221,10 +214,7 @@ inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
} }
for (size_t i = 0; i < dims.size(); i++) { for (size_t i = 0; i < dims.size(); i++) {
new_shape[index] = dims[i]; new_shape[index] = dims[i];
new_strides[index] = new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t)1, std::multiplies<size_t>());
desc->strides[dim] * desc->shape[dim] /
std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t)1,
std::multiplies<size_t>());
index++; index++;
} }
for (size_t i = dim + 1; i < ndim; i++) { for (size_t i = dim + 1; i < ndim; i++) {
......
import libinfiniop import libinfiniop
\ No newline at end of file
...@@ -41,8 +41,8 @@ def test( ...@@ -41,8 +41,8 @@ def test(
lib, lib,
handle, handle,
torch_device, torch_device,
c_shape, c_shape,
a_shape, a_shape,
b_shape, b_shape,
tensor_dtype=torch.float16, tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE, inplace=Inplace.OUT_OF_PLACE,
...@@ -56,13 +56,21 @@ def test( ...@@ -56,13 +56,21 @@ def test(
a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) c = (
torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
ans = add(a, b) ans = add(a, b)
a_tensor = to_tensor(a, lib) a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib) b_tensor = to_tensor(b, lib)
c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
descriptor = infiniopAddDescriptor_t() descriptor = infiniopAddDescriptor_t()
check_error( check_error(
...@@ -91,8 +99,10 @@ def test_cpu(lib, test_cases): ...@@ -91,8 +99,10 @@ def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device) handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases: for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -100,8 +110,10 @@ def test_cuda(lib, test_cases): ...@@ -100,8 +110,10 @@ def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device) handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases: for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -111,13 +123,16 @@ def test_bang(lib, test_cases): ...@@ -111,13 +123,16 @@ def test_bang(lib, test_cases):
device = DeviceEnum.DEVICE_BANG device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device) handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases: for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
if __name__ == "__main__": if __name__ == "__main__":
test_cases = [ test_cases = [
# fmt: off
# c_shape, a_shape, b_shape, inplace # c_shape, a_shape, b_shape, inplace
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE), # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
...@@ -133,6 +148,7 @@ if __name__ == "__main__": ...@@ -133,6 +148,7 @@ if __name__ == "__main__":
((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
# fmt: on
] ]
args = get_args() args = get_args()
lib = open_lib() lib = open_lib()
......
...@@ -35,7 +35,7 @@ class AvgPoolDescriptor(Structure): ...@@ -35,7 +35,7 @@ class AvgPoolDescriptor(Structure):
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor) infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation = 1): def pool(x, k, padding, stride, dilation=1):
pooling_layers = { pooling_layers = {
1: torch.nn.AvgPool1d, 1: torch.nn.AvgPool1d,
2: torch.nn.AvgPool2d, 2: torch.nn.AvgPool2d,
...@@ -48,7 +48,9 @@ def pool(x, k, padding, stride, dilation = 1): ...@@ -48,7 +48,9 @@ def pool(x, k, padding, stride, dilation = 1):
return None return None
if ndim == 3 and x.dtype == torch.float16: if ndim == 3 and x.dtype == torch.float16:
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x.to(torch.float32)).to(torch.float16) ans = pooling_layers[ndim](k, stride=stride, padding=padding)(
x.to(torch.float32)
).to(torch.float16)
else: else:
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x) ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x)
if PROFILE: if PROFILE:
...@@ -69,18 +71,20 @@ def inferShape(x_shape, kernel_shape, padding, strides): ...@@ -69,18 +71,20 @@ def inferShape(x_shape, kernel_shape, padding, strides):
return x_shape[:2] + tuple(output_shape) return x_shape[:2] + tuple(output_shape)
# convert a python tuple to a ctype void pointer # convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple): def tuple_to_void_p(py_tuple: Tuple):
array = ctypes.c_int64 * len(py_tuple) array = ctypes.c_int64 * len(py_tuple)
data_array = array(*py_tuple) data_array = array(*py_tuple)
return ctypes.cast(data_array, ctypes.c_void_p) return ctypes.cast(data_array, ctypes.c_void_p)
def test( def test(
lib, lib,
handle, handle,
torch_device, torch_device,
x_shape, x_shape,
k_shape, k_shape,
padding, padding,
strides, strides,
tensor_dtype=torch.float16, tensor_dtype=torch.float16,
...@@ -90,7 +94,9 @@ def test( ...@@ -90,7 +94,9 @@ def test(
) )
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device) y = torch.rand(
inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1): for i in range(NUM_PRERUN if PROFILE else 1):
ans = pool(x, k_shape, padding, strides) ans = pool(x, k_shape, padding, strides)
...@@ -126,7 +132,9 @@ def test( ...@@ -126,7 +132,9 @@ def test(
check_error( check_error(
lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
) )
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1): for i in range(NUM_PRERUN if PROFILE else 1):
...@@ -164,8 +172,10 @@ def test_cpu(lib, test_cases): ...@@ -164,8 +172,10 @@ def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device) handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases: for x_shape, kernel_shape, padding, strides in test_cases:
# fmt: off
test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -173,8 +183,10 @@ def test_cuda(lib, test_cases): ...@@ -173,8 +183,10 @@ def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device) handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases: for x_shape, kernel_shape, padding, strides in test_cases:
# fmt: off
test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -184,17 +196,21 @@ def test_bang(lib, test_cases): ...@@ -184,17 +196,21 @@ def test_bang(lib, test_cases):
device = DeviceEnum.DEVICE_BANG device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device) handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases: for x_shape, kernel_shape, padding, strides in test_cases:
# fmt: off
test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16) test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32) test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
if __name__ == "__main__": if __name__ == "__main__":
test_cases = [ test_cases = [
# fmt: off
# x_shape, kernel_shape, padding, strides # x_shape, kernel_shape, padding, strides
((1, 1, 10), (3,), (1,), (1,)), ((1, 1, 10), (3,), (1,), (1,)),
((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)), ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)), ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
# fmt: on
] ]
args = get_args() args = get_args()
lib = open_lib() lib = open_lib()
......
...@@ -101,6 +101,7 @@ def test_bang(lib, test_cases): ...@@ -101,6 +101,7 @@ def test_bang(lib, test_cases):
test(lib, handle, "mlu", x_shape, x_stride) test(lib, handle, "mlu", x_shape, x_stride)
destroy_handle(lib, handle) destroy_handle(lib, handle)
def test_ascend(lib, test_cases): def test_ascend(lib, test_cases):
import torch_npu import torch_npu
...@@ -111,11 +112,12 @@ def test_ascend(lib, test_cases): ...@@ -111,11 +112,12 @@ def test_ascend(lib, test_cases):
destroy_handle(lib, handle) destroy_handle(lib, handle)
if __name__ == "__main__": if __name__ == "__main__":
test_cases = [ test_cases = [
# x_shape, x_stride # x_shape, x_stride
((32, 20, 512), None), ((32, 20, 512), None),
((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续 ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续
] ]
args = get_args() args = get_args()
lib = open_lib() lib = open_lib()
......
...@@ -41,17 +41,11 @@ infiniopConvDescriptor_t = POINTER(ConvDescriptor) ...@@ -41,17 +41,11 @@ infiniopConvDescriptor_t = POINTER(ConvDescriptor)
def conv(x, w, stride, padding, dilation): def conv(x, w, stride, padding, dilation):
match len(x.shape) - 2: match len(x.shape) - 2:
case 1: case 1:
return F.conv1d( return F.conv1d(x, w, stride=stride, padding=padding, dilation=dilation)
x, w, stride=stride, padding=padding, dilation=dilation
)
case 2: case 2:
return F.conv2d( return F.conv2d(x, w, stride=stride, padding=padding, dilation=dilation)
x, w, stride=stride, padding=padding, dilation=dilation
)
case 3: case 3:
return F.conv3d( return F.conv3d(x, w, stride=stride, padding=padding, dilation=dilation)
x, w, stride=stride, padding=padding, dilation=dilation
)
case _: case _:
print("Error: Pytorch -> Unsupported tensor dimension") print("Error: Pytorch -> Unsupported tensor dimension")
return None return None
...@@ -66,11 +60,15 @@ def inferShape( ...@@ -66,11 +60,15 @@ def inferShape(
dilations: List[int], dilations: List[int],
) -> Tuple[int, ...]: ) -> Tuple[int, ...]:
assert ( assert (
len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2 len(x_shape)
== len(w_shape)
== len(pads) + 2
== len(dilations) + 2
== len(strides) + 2
), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2" ), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
output_dims = [ output_dims = [
math.floor( math.floor(
(x_shape[i+2] + 2 * pads[i] - dilations[i] * (w_shape[i+2] - 1) - 1) (x_shape[i + 2] + 2 * pads[i] - dilations[i] * (w_shape[i + 2] - 1) - 1)
/ strides[i] / strides[i]
+ 1 + 1
) )
...@@ -145,7 +143,9 @@ def test( ...@@ -145,7 +143,9 @@ def test(
check_error( check_error(
lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
) )
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1): for i in range(NUM_PRERUN if PROFILE else 1):
...@@ -177,7 +177,7 @@ def test( ...@@ -177,7 +177,7 @@ def test(
elapsed = (time.time() - start_time) / NUM_ITERATIONS elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}") print(f" lib time: {elapsed :6f}")
if (tensor_dtype == torch.float16): if tensor_dtype == torch.float16:
assert torch.allclose(y, ans, atol=0, rtol=1e-2) assert torch.allclose(y, ans, atol=0, rtol=1e-2)
else: else:
assert torch.allclose(y, ans, atol=0, rtol=1e-3) assert torch.allclose(y, ans, atol=0, rtol=1e-3)
...@@ -188,8 +188,10 @@ def test_cpu(lib, test_cases): ...@@ -188,8 +188,10 @@ def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device) handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# fmt: off
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -197,8 +199,10 @@ def test_cuda(lib, test_cases): ...@@ -197,8 +199,10 @@ def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device) handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# fmt: off
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -208,8 +212,10 @@ def test_bang(lib, test_cases): ...@@ -208,8 +212,10 @@ def test_bang(lib, test_cases):
device = DeviceEnum.DEVICE_BANG device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device) handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases: for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# fmt: off
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16) test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32) test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
......
...@@ -47,10 +47,10 @@ def test( ...@@ -47,10 +47,10 @@ def test(
lib, lib,
handle, handle,
torch_device, torch_device,
y_shape, y_shape,
x_shape, x_shape,
y_stride=None, y_stride=None,
x_stride=None, x_stride=None,
tensor_dtype=torch.float16, tensor_dtype=torch.float16,
): ):
print( print(
...@@ -109,8 +109,10 @@ def test_cpu(lib, test_cases): ...@@ -109,8 +109,10 @@ def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device) handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases: for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -118,8 +120,10 @@ def test_cuda(lib, test_cases): ...@@ -118,8 +120,10 @@ def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device) handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases: for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -129,13 +133,16 @@ def test_bang(lib, test_cases): ...@@ -129,13 +133,16 @@ def test_bang(lib, test_cases):
device = DeviceEnum.DEVICE_BANG device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device) handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases: for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16) test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32) test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
if __name__ == "__main__": if __name__ == "__main__":
test_cases = [ test_cases = [
# fmt: off
# y_shape, x_shape, y_stride, x_stride # y_shape, x_shape, y_stride, x_stride
((), (), None, None), ((), (), None, None),
((3, 3), (1,), None, None), ((3, 3), (1,), None, None),
...@@ -146,6 +153,7 @@ if __name__ == "__main__": ...@@ -146,6 +153,7 @@ if __name__ == "__main__":
((2, 3, 4, 5), (5,), None, None), ((2, 3, 4, 5), (5,), None, None),
((3, 2, 4, 5), (3, 2, 1, 1), None, None), ((3, 2, 4, 5), (3, 2, 1, 1), None, None),
((32, 256, 112, 112), (32, 256, 112, 1), None, None), ((32, 256, 112, 112), (32, 256, 112, 1), None, None),
# fmt: on
] ]
args = get_args() args = get_args()
lib = open_lib() lib = open_lib()
......
...@@ -27,6 +27,7 @@ PROFILE = False ...@@ -27,6 +27,7 @@ PROFILE = False
NUM_PRERUN = 10 NUM_PRERUN = 10
NUM_ITERATIONS = 1000 NUM_ITERATIONS = 1000
class GEMMDescriptor(Structure): class GEMMDescriptor(Structure):
_fields_ = [("device", c_int32)] _fields_ = [("device", c_int32)]
...@@ -34,10 +35,15 @@ class GEMMDescriptor(Structure): ...@@ -34,10 +35,15 @@ class GEMMDescriptor(Structure):
infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor) infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor)
def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32): def gemm(
A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32
):
A = A.T if transA else A A = A.T if transA else A
B = B.T if transB else B B = B.T if transB else B
result = alpha * torch.matmul(A if dtype != torch.float16 else A.to(torch.float32), B if dtype != torch.float16 else B.to(torch.float32)).to(dtype) result = alpha * torch.matmul(
A if dtype != torch.float16 else A.to(torch.float32),
B if dtype != torch.float16 else B.to(torch.float32),
).to(dtype)
if C is not None: if C is not None:
result += beta * C if dtype != torch.float16 else C.to(torch.float32) result += beta * C if dtype != torch.float16 else C.to(torch.float32)
if PROFILE: if PROFILE:
...@@ -64,7 +70,7 @@ def test( ...@@ -64,7 +70,7 @@ def test(
dtype=torch.float16, dtype=torch.float16,
): ):
print( print(
f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} " f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} " f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} "
f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}" f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}"
) )
...@@ -121,9 +127,7 @@ def test( ...@@ -121,9 +127,7 @@ def test(
workspace_size = ctypes.c_uint64(0) workspace_size = ctypes.c_uint64(0)
check_error( check_error(
lib.infiniopGetGEMMWorkspaceSize( lib.infiniopGetGEMMWorkspaceSize(descriptor, ctypes.byref(workspace_size))
descriptor, ctypes.byref(workspace_size)
)
) )
workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to( workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to(
torch_device torch_device
...@@ -182,8 +186,10 @@ def test_cpu(lib, test_cases): ...@@ -182,8 +186,10 @@ def test_cpu(lib, test_cases):
c_stride, c_stride,
y_stride, y_stride,
) in test_cases: ) in test_cases:
# fmt: off
test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -204,8 +210,10 @@ def test_cuda(lib, test_cases): ...@@ -204,8 +210,10 @@ def test_cuda(lib, test_cases):
c_stride, c_stride,
y_stride, y_stride,
) in test_cases: ) in test_cases:
# fmt: off
test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -229,9 +237,10 @@ def test_bang(lib, test_cases): ...@@ -229,9 +237,10 @@ def test_bang(lib, test_cases):
c_stride, c_stride,
y_stride, y_stride,
) in test_cases: ) in test_cases:
# fmt: off
test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16) test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32) test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle) destroy_handle(lib, handle)
......
...@@ -99,7 +99,12 @@ def test( ...@@ -99,7 +99,12 @@ def test(
for i in range(NUM_PRERUN if PROFILE else 1): for i in range(NUM_PRERUN if PROFILE else 1):
check_error( check_error(
lib.infiniopGlobalAvgPool( lib.infiniopGlobalAvgPool(
descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
) )
) )
if PROFILE: if PROFILE:
......
import os import os
import sys import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))
from .liboperators import open_lib, CTensor, infiniopHandle_t, infiniopTensorDescriptor_t sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))
from .liboperators import (
open_lib,
CTensor,
infiniopHandle_t,
infiniopTensorDescriptor_t,
)
from .devices import * from .devices import *
from .utils import * from .utils import *
from .datatypes import * from .datatypes import *
...@@ -7,7 +7,7 @@ class InfiniDtype: ...@@ -7,7 +7,7 @@ class InfiniDtype:
I32 = 5 I32 = 5
I64 = 6 I64 = 6
U8 = 7 U8 = 7
U16 = 8 U16 = 8
U32 = 9 U32 = 9
U64 = 10 U64 = 10
F8 = 11 F8 = 11
......
...@@ -54,6 +54,7 @@ def create_workspace(size, torch_device): ...@@ -54,6 +54,7 @@ def create_workspace(size, torch_device):
if size == 0: if size == 0:
return None return None
import torch import torch
return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device) return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device)
...@@ -172,6 +173,7 @@ def get_args(): ...@@ -172,6 +173,7 @@ def get_args():
def synchronize_device(torch_device): def synchronize_device(torch_device):
import torch import torch
if torch_device == "cuda": if torch_device == "cuda":
torch.cuda.synchronize() torch.cuda.synchronize()
elif torch_device == "npu": elif torch_device == "npu":
...@@ -197,13 +199,24 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True): ...@@ -197,13 +199,24 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
If True, the function will print detailed information about any discrepancies between the tensors. If True, the function will print detailed information about any discrepancies between the tensors.
""" """
import numpy as np import numpy as np
print_discrepancy(actual, desired, atol, rtol, verbose) print_discrepancy(actual, desired, atol, rtol, verbose)
np.testing.assert_allclose(actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True, strict=True) np.testing.assert_allclose(
actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True, strict=True
)
def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, atol=0, rtol=1e-2, equal_nan=False, verbose=True): def debug_all(
actual_vals: Sequence,
desired_vals: Sequence,
condition: str,
atol=0,
rtol=1e-2,
equal_nan=False,
verbose=True,
):
""" """
Debugging function to compare two sequences of values (actual and desired) pair by pair, results Debugging function to compare two sequences of values (actual and desired) pair by pair, results
are linked by the given logical condition, and prints discrepancies are linked by the given logical condition, and prints discrepancies
Arguments: Arguments:
---------- ----------
...@@ -223,7 +236,10 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato ...@@ -223,7 +236,10 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
- AssertionError: If the specified `condition` is not 'or' or 'and'. - AssertionError: If the specified `condition` is not 'or' or 'and'.
""" """
assert len(actual_vals) == len(desired_vals), "Invalid Length" assert len(actual_vals) == len(desired_vals), "Invalid Length"
assert condition in {"or", "and"}, "Invalid condition: should be either 'or' or 'and'" assert condition in {
"or",
"and",
}, "Invalid condition: should be either 'or' or 'and'"
import numpy as np import numpy as np
passed = False if condition == "or" else True passed = False if condition == "or" else True
...@@ -237,14 +253,22 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato ...@@ -237,14 +253,22 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
elif condition == "and": elif condition == "and":
if passed and len(indices) != 0: if passed and len(indices) != 0:
passed = False passed = False
print(f"\033[31mThe condition has not been satisfied: Condition #{index + 1}\033[0m") print(
np.testing.assert_allclose(actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True, strict=True) f"\033[31mThe condition has not been satisfied: Condition #{index + 1}\033[0m"
)
np.testing.assert_allclose(
actual.cpu(),
desired.cpu(),
rtol,
atol,
equal_nan,
verbose=True,
strict=True,
)
assert passed, "\033[31mThe condition has not been satisfied\033[0m" assert passed, "\033[31mThe condition has not been satisfied\033[0m"
def print_discrepancy( def print_discrepancy(actual, expected, atol=0, rtol=1e-3, verbose=True):
actual, expected, atol=0, rtol=1e-3, verbose=True
):
if actual.shape != expected.shape: if actual.shape != expected.shape:
raise ValueError("Tensors must have the same shape to compare.") raise ValueError("Tensors must have the same shape to compare.")
...@@ -273,7 +297,9 @@ def print_discrepancy( ...@@ -273,7 +297,9 @@ def print_discrepancy(
for idx in diff_indices: for idx in diff_indices:
index_tuple = tuple(idx.tolist()) index_tuple = tuple(idx.tolist())
actual_str = f"{actual[index_tuple]:<{col_width[1]}.{decimal_places[1]}f}" actual_str = f"{actual[index_tuple]:<{col_width[1]}.{decimal_places[1]}f}"
expected_str = f"{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}" expected_str = (
f"{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}"
)
delta_str = f"{delta[index_tuple]:<{col_width[3]}.{decimal_places[3]}f}" delta_str = f"{delta[index_tuple]:<{col_width[3]}.{decimal_places[3]}f}"
print( print(
f" > Index: {str(index_tuple):<{col_width[0]}}" f" > Index: {str(index_tuple):<{col_width[0]}}"
...@@ -287,10 +313,18 @@ def print_discrepancy( ...@@ -287,10 +313,18 @@ def print_discrepancy(
print(f" - Desired dtype: {expected.dtype}") print(f" - Desired dtype: {expected.dtype}")
print(f" - Atol: {atol}") print(f" - Atol: {atol}")
print(f" - Rtol: {rtol}") print(f" - Rtol: {rtol}")
print(f" - Mismatched elements: {len(diff_indices)} / {actual.numel()} ({len(diff_indices) / actual.numel() * 100}%)") print(
print(f" - Min(actual) : {torch.min(actual):<{col_width[1]}} | Max(actual) : {torch.max(actual):<{col_width[2]}}") f" - Mismatched elements: {len(diff_indices)} / {actual.numel()} ({len(diff_indices) / actual.numel() * 100}%)"
print(f" - Min(desired): {torch.min(expected):<{col_width[1]}} | Max(desired): {torch.max(expected):<{col_width[2]}}") )
print(f" - Min(delta) : {torch.min(delta):<{col_width[1]}} | Max(delta) : {torch.max(delta):<{col_width[2]}}") print(
f" - Min(actual) : {torch.min(actual):<{col_width[1]}} | Max(actual) : {torch.max(actual):<{col_width[2]}}"
)
print(
f" - Min(desired): {torch.min(expected):<{col_width[1]}} | Max(desired): {torch.max(expected):<{col_width[2]}}"
)
print(
f" - Min(delta) : {torch.min(delta):<{col_width[1]}} | Max(delta) : {torch.max(delta):<{col_width[2]}}"
)
print("-" * total_width + "\n") print("-" * total_width + "\n")
return diff_indices return diff_indices
...@@ -298,14 +332,17 @@ def print_discrepancy( ...@@ -298,14 +332,17 @@ def print_discrepancy(
def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3): def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3):
""" """
Returns the atol and rtol for a given tensor data type in the tolerance_map. Returns the atol and rtol for a given tensor data type in the tolerance_map.
If the given data type is not found, it returns the provided default tolerance values. If the given data type is not found, it returns the provided default tolerance values.
""" """
return tolerance_map.get(tensor_dtype, {'atol': default_atol, 'rtol': default_rtol}).values() return tolerance_map.get(
tensor_dtype, {"atol": default_atol, "rtol": default_rtol}
).values()
def timed_op(func, num_iterations, device): def timed_op(func, num_iterations, device):
import time import time
""" Function for timing operations with synchronization. """ """ Function for timing operations with synchronization. """
synchronize_device(device) synchronize_device(device)
start = time.time() start = time.time()
...@@ -318,7 +355,7 @@ def timed_op(func, num_iterations, device): ...@@ -318,7 +355,7 @@ def timed_op(func, num_iterations, device):
def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS): def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
""" """
Unified profiling workflow that is used to profile the execution time of a given function. Unified profiling workflow that is used to profile the execution time of a given function.
It first performs a number of warmup runs, then performs timed execution and It first performs a number of warmup runs, then performs timed execution and
prints the average execution time. prints the average execution time.
Arguments: Arguments:
...@@ -328,11 +365,11 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS): ...@@ -328,11 +365,11 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
- torch_device (str): The device on which the operation runs, provided for timed execution. - torch_device (str): The device on which the operation runs, provided for timed execution.
- NUM_PRERUN (int): The number of warmup runs. - NUM_PRERUN (int): The number of warmup runs.
- NUM_ITERATIONS (int): The number of timed execution iterations, used to calculate the average execution time. - NUM_ITERATIONS (int): The number of timed execution iterations, used to calculate the average execution time.
""" """
# Warmup runs # Warmup runs
for _ in range(NUM_PRERUN): for _ in range(NUM_PRERUN):
func() func()
# Timed execution # Timed execution
elapsed = timed_op(lambda: func(), NUM_ITERATIONS, torch_device) elapsed = timed_op(lambda: func(), NUM_ITERATIONS, torch_device)
print(f" {desc} time: {elapsed * 1000 :6f} ms") print(f" {desc} time: {elapsed * 1000 :6f} ms")
...@@ -347,7 +384,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes): ...@@ -347,7 +384,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
- lib (ctypes.CDLL): The library object containing the operator implementations. - lib (ctypes.CDLL): The library object containing the operator implementations.
- device (InfiniDeviceEnum): The device on which the operator should be tested. See device.py. - device (InfiniDeviceEnum): The device on which the operator should be tested. See device.py.
- test_func (function): The test function to be executed for each test case. - test_func (function): The test function to be executed for each test case.
- test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters - test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
to be passed to `test_func`. to be passed to `test_func`.
- tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test. - tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
""" """
...@@ -355,7 +392,13 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes): ...@@ -355,7 +392,13 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
try: try:
for test_case in test_cases: for test_case in test_cases:
for tensor_dtype in tensor_dtypes: for tensor_dtype in tensor_dtypes:
test_func(lib, handle, infiniDeviceEnum_str_map[device], *test_case, tensor_dtype) test_func(
lib,
handle,
infiniDeviceEnum_str_map[device],
*test_case,
tensor_dtype,
)
finally: finally:
destroy_handle(lib, handle) destroy_handle(lib, handle)
...@@ -365,22 +408,26 @@ def get_test_devices(args): ...@@ -365,22 +408,26 @@ def get_test_devices(args):
Using the given parsed Namespace to determine the devices to be tested. Using the given parsed Namespace to determine the devices to be tested.
Argument: Argument:
- args: the parsed Namespace object. - args: the parsed Namespace object.
Return: Return:
- devices_to_test: the devices that will be tested. Default is CPU. - devices_to_test: the devices that will be tested. Default is CPU.
""" """
devices_to_test = [] devices_to_test = []
if args.cpu: devices_to_test.append(InfiniDeviceEnum.CPU) if args.cpu:
if args.nvidia: devices_to_test.append(InfiniDeviceEnum.NVIDIA) devices_to_test.append(InfiniDeviceEnum.CPU)
if args.cambricon: if args.nvidia:
devices_to_test.append(InfiniDeviceEnum.NVIDIA)
if args.cambricon:
import torch_mlu import torch_mlu
devices_to_test.append(InfiniDeviceEnum.CAMBRICON) devices_to_test.append(InfiniDeviceEnum.CAMBRICON)
if args.ascend: if args.ascend:
import torch import torch
import torch_npu import torch_npu
torch.npu.set_device(0) # Ascend NPU needs explicit device initialization
torch.npu.set_device(0) # Ascend NPU needs explicit device initialization
devices_to_test.append(InfiniDeviceEnum.ASCEND) devices_to_test.append(InfiniDeviceEnum.ASCEND)
if not devices_to_test: if not devices_to_test:
devices_to_test = [InfiniDeviceEnum.CPU] devices_to_test = [InfiniDeviceEnum.CPU]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment