jerrrrry / infinicore · Commit 3c31dc6c (unverified)

Authored Feb 18, 2025 by PanZezhong1725; committed by GitHub on Feb 18, 2025.

Merge pull request #45 from YdrMaster/main

issue/52 代码格式化：机制和效果 (code formatting: mechanism and effect)

Parents: 16dad776, e5ed9fa1

Changes: 48 · Showing 20 changed files with 424 additions and 330 deletions (+424 −330); the remaining files are on subsequent pages.
src/infiniop/ops/matmul/cpu/matmul_cpu.cc          +5   −14
src/infiniop/ops/matmul/cuda/matmul_cuda.cu        +1   −1
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh       +3   −3
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h     +0   −2
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu +2   −2
src/infiniop/ops/random_sample/operator.cc         +68  −68
src/infiniop/ops/rearrange/operator.cc             +57  −57
src/infiniop/ops/rms_norm/operator.cc              +80  −80
src/infiniop/ops/utils.h                           +27  −37
test/infiniop/__init__.py                          +1   −1
test/infiniop/add.py                               +20  −4
test/infiniop/avg_pool.py                          +22  −6
test/infiniop/causal_softmax.py                    +3   −1
test/infiniop/conv.py                              +19  −13
test/infiniop/expand.py                            +11  −3
test/infiniop/gemm.py                              +16  −7
test/infiniop/global_avg_pool.py                   +6   −1
test/infiniop/libinfiniop/__init__.py              +8   −2
test/infiniop/libinfiniop/datatypes.py             +1   −1
test/infiniop/libinfiniop/utils.py                 +74  −27

Consistent with the commit message, the hunks below are formatting changes: lines are re-wrapped, re-indented, or re-ordered without changing behavior. Each file section shows the changed regions in their post-format form.
src/infiniop/ops/matmul/cpu/matmul_cpu.cc

@@ -38,8 +38,8 @@ cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc) {

```cpp
template <typename Tdata>
infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
                                    float beta, void const *a, void const *b,
                                    float alpha) {
    auto info = desc->info;
    if (info.is_transed) {
```

@@ -49,20 +49,11 @@ infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,

```cpp
    for (size_t i = 0; i < info.batch; ++i) {
        for (size_t m_ = 0; m_ < info.m; ++m_) {
            for (size_t n_ = 0; n_ < info.n; ++n_) {
                auto c_ = reinterpret_cast<Tdata *>(c) + i * info.c_matrix.stride
                        + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride;
                float sum = 0;
                for (size_t k_ = 0; k_ < info.k; ++k_) {
                    auto a_ = reinterpret_cast<Tdata const *>(a) + i * info.a_matrix.stride
                            + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride;
                    auto b_ = reinterpret_cast<Tdata const *>(b) + i * info.b_matrix.stride
                            + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride;
                    if constexpr (std::is_same<Tdata, uint16_t>::value) {
                        sum += f16_to_f32(*a_) * f16_to_f32(*b_);
                    } else {
```
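The pointer arithmetic above is the whole addressing scheme: element (m_, n_) of batch i lives at base + i·stride + m_·row_stride + n_·col_stride, so transposing an operand is just a matter of swapping its row and column strides. A minimal self-contained sketch of that rule; `MatrixLayout` is a hypothetical stand-in mirroring only the three stride fields the kernel reads:

```cpp
#include <cstddef>
#include <cstdio>

// Hypothetical mirror of the layout fields used by cpuCalculateMatmul: a batch
// stride plus independent row/column strides, so the same indexing serves both
// normal and transposed storage.
struct MatrixLayout {
    ptrdiff_t stride;     // elements between consecutive batch matrices
    ptrdiff_t row_stride; // elements between consecutive rows
    ptrdiff_t col_stride; // elements between consecutive columns
};

int main() {
    // A 2x3 row-major matrix: row_stride = 3, col_stride = 1.
    MatrixLayout normal{6, 3, 1};
    // The same buffer viewed transposed (3x2): the strides swap.
    MatrixLayout transposed{6, 1, 3};

    // Offset of element (m, n) in batch i, exactly as in the kernel above.
    auto offset = [](const MatrixLayout &l, ptrdiff_t i, ptrdiff_t m, ptrdiff_t n) {
        return i * l.stride + m * l.row_stride + n * l.col_stride;
    };

    std::printf("normal (1,2): %td\n", offset(normal, 0, 1, 2));        // 5
    std::printf("transposed (2,1): %td\n", offset(transposed, 0, 2, 1)); // 5
    return 0;
}
```

Both calls hit the same element of the underlying buffer, which is why the CPU kernel never needs a separate transposed code path.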
src/infiniop/ops/matmul/cuda/matmul_cuda.cu

```cpp
#include "../../utils.h"
#include "./matmul_cuda.cuh"

infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
                                            infiniopMatmulCudaDescriptor_t *desc_ptr,
```

(The only change re-sorts the two includes.)
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh

```cpp
#ifndef __INFINIOP_MATMUL_CUDA_H__
#define __INFINIOP_MATMUL_CUDA_H__

#include "../../../devices/cuda/common_cuda.cuh"
#include "../blas.h"
#include "matmul_cuda_api.h"
#include <memory>

typedef struct InfiniopMatmulCudaDescriptor {
    infiniDevice_t device;
```

@@ -14,4 +14,4 @@ typedef struct InfiniopMatmulCudaDescriptor {

```cpp
    std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool;
} InfiniopMatmulCudaDescriptor;

#endif // __INFINIOP_MATMUL_CUDA_H__
```
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h

@@ -4,7 +4,6 @@

```cpp
#include "../../../devices/cuda/cuda_handle.h"
#include "infiniop/operator.h"

struct InfiniopMatmulCudaDescriptor;
typedef struct InfiniopMatmulCudaDescriptor *infiniopMatmulCudaDescriptor_t;
```

@@ -28,5 +27,4 @@ infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,

```cpp
infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc);

#endif // __INFINIOP_MATMUL_CUDA_API_H__
```
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu

```cpp
#include "../../utils.h"
#include "./matmul_cuda.cuh"

template <typename Tdata>
infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c,
                                  float beta, void const *a, void const *b,
                                  float alpha, void *stream) {
    auto info = desc->info;
```

@@ -26,7 +26,7 @@ infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c,

```cpp
    auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
    auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
    use_cublas(desc->cublas_handle_pool, desc->device_id, (cudaStream_t)stream,
               [&](cublasHandle_t handle) {
                   cublasGemmStridedBatchedEx(
                       handle,
                       op_a,
```
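The `op_a`/`op_b` selection relies on cuBLAS being column-major: an operand whose consecutive rows are adjacent in memory (`row_stride == 1`) is already in cuBLAS's native layout and can be passed with `CUBLAS_OP_N`; any other layout is handed over as transposed. A toy sketch of that rule (the enum and helper are illustrative, not cuBLAS API):

```cpp
#include <cstdio>

// Stand-in for cublasOperation_t; only the decision rule matters here.
enum Op { OP_N, OP_T };

// Mirrors the ternary in matmul_cuda_kernel.cu: rows adjacent in memory
// (row_stride == 1) means column-major storage, cuBLAS's native layout.
Op pickOp(long row_stride) { return row_stride == 1 ? OP_N : OP_T; }

int main() {
    // 3x4 column-major buffer: row_stride = 1, col_stride = 3 -> OP_N.
    std::printf("column-major: %s\n", pickOp(1) == OP_N ? "OP_N" : "OP_T");
    // 3x4 row-major buffer: row_stride = 4 -> hand it to cuBLAS as OP_T.
    std::printf("row-major:    %s\n", pickOp(4) == OP_N ? "OP_N" : "OP_T");
    return 0;
}
```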
src/infiniop/ops/random_sample/operator.cc

@@ -3,36 +3,36 @@

```cpp
__C infiniopStatus_t infiniopCreateRandomSampleDescriptor(
    infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) {
    switch (handle->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuCreateRandomSampleDescriptor(
            handle, (RandomSampleCpuDescriptor_t *)desc_ptr, result, probs);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu:
        return cudaCreateRandomSampleDescriptor(
            (CudaHandle_t)handle, (RandomSampleCudaDescriptor_t *)desc_ptr,
            result, probs);
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangCreateRandomSampleDescriptor(
            (BangHandle_t)handle, (RandomSampleBangDescriptor_t *)desc_ptr,
            result, probs);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return ascendCreateRandomSampleDescriptor(
            (AscendHandle_t)handle, (RandomSampleAscendDescriptor_t *)desc_ptr,
            result, probs);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaCreateRandomSampleDescriptor(
            (MacaHandle_t)handle, (RandomSampleMacaDescriptor_t *)desc_ptr,
            result, probs);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu:
        return musaCreateRandomSampleDescriptor(
            (MusaHandle_t)handle, (RandomSampleMusaDescriptor_t *)desc_ptr,
            result, probs);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```

@@ -41,35 +41,35 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl

```cpp
__C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(
    infiniopRandomSampleDescriptor_t desc, uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t)desc, size);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t)desc, size);
        // return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t)desc, size);
    }
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```

@@ -87,31 +87,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,

```cpp
                                          void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuRandomSample((RandomSampleCpuDescriptor_t)desc, workspace,
                               workspace_size, result, probs, random_val,
                               topp, topk, temperature, stream);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu:
        return cudaRandomSample((RandomSampleCudaDescriptor_t)desc, workspace,
                                workspace_size, result, probs, random_val,
                                topp, topk, temperature, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangRandomSample((RandomSampleBangDescriptor_t)desc, workspace,
                                workspace_size, result, probs, random_val,
                                topp, topk, temperature, stream);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return ascendRandomSample((RandomSampleAscendDescriptor_t)desc, workspace,
                                  workspace_size, result, probs, random_val,
                                  topp, topk, temperature, stream);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaRandomSample((RandomSampleMacaDescriptor_t)desc, workspace,
                                workspace_size, result, probs, random_val,
                                topp, topk, temperature, stream);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu:
        return musaRandomSample((RandomSampleMusaDescriptor_t)desc, workspace,
                                workspace_size, result, probs, random_val,
                                topp, topk, temperature, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```

@@ -120,31 +120,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,

```cpp
__C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(
    infiniopRandomSampleDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t)desc);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu:
        return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t)desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu:
        return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t)desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```
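Each public entry point in this file is a thin dispatcher over the same descriptor lifecycle: create, query workspace, run, destroy. A hypothetical caller stringing the four together, using only names visible in this diff plus the `CHECK_STATUS` macro from src/infiniop/ops/utils.h below; the success constant `INFINIOP_STATUS_SUCCESS`, the scalar parameter types, and the host-side workspace allocation are assumptions:

```cpp
#include <cstdint>
#include <cstdlib>

// Hypothetical caller; compiles only against the library headers that declare
// the infiniop*RandomSample* entry points shown above.
infiniopStatus_t sampleOnce(infiniopHandle_t handle,
                            infiniopTensorDescriptor_t result_desc,
                            infiniopTensorDescriptor_t probs_desc,
                            void *result, void const *probs, float random_val,
                            float topp, int topk, float temperature,
                            void *stream) {
    infiniopRandomSampleDescriptor_t desc;
    CHECK_STATUS(infiniopCreateRandomSampleDescriptor(handle, &desc,
                                                      result_desc, probs_desc),
                 INFINIOP_STATUS_SUCCESS); // success constant assumed
    uint64_t workspace_size = 0;
    CHECK_STATUS(infiniopGetRandomSampleWorkspaceSize(desc, &workspace_size),
                 INFINIOP_STATUS_SUCCESS);
    void *workspace = std::malloc(workspace_size); // device memory in practice
    auto status = infiniopRandomSample(desc, workspace, workspace_size, result,
                                       probs, random_val, topp, topk,
                                       temperature, stream);
    std::free(workspace);
    infiniopDestroyRandomSampleDescriptor(desc);
    return status;
}
```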
src/infiniop/ops/rearrange/operator.cc

@@ -7,37 +7,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(

```cpp
    infiniopTensorDescriptor_t src) {
    switch (handle->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *)desc_ptr,
                                            dst, src);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaCreateRearrangeDescriptor((CudaHandle_t)handle,
                                             (RearrangeCudaDescriptor_t *)desc_ptr,
                                             dst, src);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangCreateRearrangeDescriptor((BangHandle_t)handle,
                                             (RearrangeBangDescriptor_t *)desc_ptr,
                                             dst, src);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnCreateRearrangeDescriptor((AscendHandle_t)handle,
                                              (RearrangeAclnnDescriptor_t *)desc_ptr,
                                              dst, src);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaCreateRearrangeDescriptor((MacaHandle_t)handle,
                                             (RearrangeMacaDescriptor_t *)desc_ptr,
                                             dst, src);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaCreateRearrangeDescriptor((MusaHandle_t)handle,
                                             (RearrangeMusaDescriptor_t *)desc_ptr,
                                             dst, src);
    }
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```

@@ -46,37 +46,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(

```cpp
__C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc,
                                       void *dst, void const *src, void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuRearrange((RearrangeCpuDescriptor_t)desc, dst, src, stream);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaRearrange((RearrangeCudaDescriptor_t)desc, dst, src, stream);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangRearrange((RearrangeBangDescriptor_t)desc, dst, src, stream);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnRearrange((RearrangeAclnnDescriptor_t)desc, dst, src, stream);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaRearrange((RearrangeMacaDescriptor_t)desc, dst, src, stream);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaRearrange((RearrangeMusaDescriptor_t)desc, dst, src, stream);
    }
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```

@@ -85,34 +85,34 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void

```cpp
__C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t)desc);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t)desc);
    }
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```
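random_sample, rearrange, and rms_norm all share the same compile-time gating: each backend's `case` label exists only when its `ENABLE_*` flag is defined, and any device that falls out of the switch gets `INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED`. A self-contained toy of the mechanism (the enums and values here are stand-ins, not the library's):

```cpp
#include <iostream>

// Comment this out to compile the CPU case away and watch both calls
// fall through to the NOT_SUPPORTED return.
#define ENABLE_CPU 1

enum Device { DevCpu, DevNvGpu };
enum Status { STATUS_SUCCESS, STATUS_DEVICE_TYPE_NOT_SUPPORTED };

Status rearrange(Device device) {
    switch (device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return STATUS_SUCCESS; // would call cpuRearrange(...)
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu:
        return STATUS_SUCCESS; // would call cudaRearrange(...)
#endif
    }
    // Mirrors the library's pattern: no default label; the trailing return
    // handles every device whose backend was not compiled in.
    return STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}

int main() {
    std::cout << rearrange(DevCpu) << "\n";   // 0: CPU backend compiled in
    std::cout << rearrange(DevNvGpu) << "\n"; // 1: CUDA case compiled out here
}
```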
src/infiniop/ops/rms_norm/operator.cc

@@ -9,38 +9,38 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(

```cpp
    float epsilon) {
    switch (handle->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *)desc_ptr,
                                          y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaCreateRMSNormDescriptor((CudaHandle_t)handle,
                                           (RMSNormCudaDescriptor_t *)desc_ptr,
                                           y_desc, x_desc, w_desc, epsilon);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangCreateRMSNormDescriptor((BangHandle_t)handle,
                                           (RMSNormBangDescriptor_t *)desc_ptr,
                                           y_desc, x_desc, w_desc, epsilon);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnCreateRMSNormDescriptor((AscendHandle_t)handle,
                                            (RMSNormAclnnDescriptor_t *)desc_ptr,
                                            y_desc, x_desc, w_desc, epsilon);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaCreateRMSNormDescriptor((MacaHandle_t)handle,
                                           (RMSNormMacaDescriptor_t *)desc_ptr,
                                           y_desc, x_desc, w_desc, epsilon);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaCreateRMSNormDescriptor((MusaHandle_t)handle,
                                           (RMSNormMusaDescriptor_t *)desc_ptr,
                                           y_desc, x_desc, w_desc, epsilon);
    }
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```

@@ -49,35 +49,35 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(

```cpp
__C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(
    infiniopRMSNormDescriptor_t desc, uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t)desc, size);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t)desc, size);
    }
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```

@@ -87,40 +87,40 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor

```cpp
                                     void *y, void const *x, void const *w,
                                     void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuRMSNorm((RMSNormCpuDescriptor_t)desc, workspace,
                          workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaRMSNorm((RMSNormCudaDescriptor_t)desc, workspace,
                           workspace_size, y, x, w, stream);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangRMSNorm((RMSNormBangDescriptor_t)desc, workspace,
                           workspace_size, y, x, w, stream);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnRMSNorm((RMSNormAclnnDescriptor_t)desc, workspace,
                            workspace_size, y, x, w, stream);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaRMSNorm((RMSNormMacaDescriptor_t)desc, workspace,
                           workspace_size, y, x, w, stream);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaRMSNorm((RMSNormMusaDescriptor_t)desc, workspace,
                           workspace_size, y, x, w, stream);
    }
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```

@@ -129,34 +129,34 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor

```cpp
__C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t)desc);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t)desc);
    }
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
```
src/infiniop/ops/utils.h

@@ -13,28 +13,28 @@

```cpp
#define ROUND_UP_DIV(x, y) ((x + y - 1) / y)

#define CHECK_ERROR(call, target, errCode)                                 \
    do {                                                                   \
        if (auto value = (call); value == (target)) {                      \
            std::cerr << "Error: expected " << (target) << " but got "     \
                      << value << " in file " << __FILE__ << ", function " \
                      << __func__ << ", line " << __LINE__ << std::endl;   \
            return (errCode);                                              \
        }                                                                  \
    } while (0)

#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
    expr;                                                \
    CHECK_ERROR(value, target, errCode)

#define CHECK_STATUS(call, target)                                         \
    do {                                                                   \
        if (auto value = (call); value != (target)) {                      \
            std::cerr << "Error: expected " << (target) << " but got "     \
                      << value << " in file " << __FILE__ << ", function " \
                      << __func__ << ", line " << __LINE__ << std::endl;   \
            return value;                                                  \
        }                                                                  \
    } while (0)
```
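Aside from re-aligning the continuation backslashes, nothing changes in these macros. For reference, this is how they behave: the `do { ... } while (0)` wrapper makes each check a single statement, and the C++17 if-initializer evaluates `call` exactly once. A minimal self-contained demo (`fakeOp` and `run` are made up for the sketch):

```cpp
#include <iostream>

// Copied verbatim from the CHECK_STATUS definition above.
#define CHECK_STATUS(call, target)                                         \
    do {                                                                   \
        if (auto value = (call); value != (target)) {                      \
            std::cerr << "Error: expected " << (target) << " but got "     \
                      << value << " in file " << __FILE__ << ", function " \
                      << __func__ << ", line " << __LINE__ << std::endl;   \
            return value;                                                  \
        }                                                                  \
    } while (0)

int fakeOp(bool fail) { return fail ? 42 : 0; } // stand-in for a real op call

int run(bool fail) {
    CHECK_STATUS(fakeOp(fail), 0); // early-returns 42 when fail is true
    return 0;
}

int main() {
    std::cout << run(false) << "\n"; // 0
    std::cout << run(true) << "\n";  // 42, after the diagnostic on stderr
}
```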
```cpp
inline std::vector<int64_t> getByteStrides(infiniopTensorDescriptor_t desc) {
```

@@ -67,8 +67,7 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,

```cpp
    // compute broadcasted shape
    for (size_t i = 0; i < max_rank; ++i) {
        if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 ||
            padded_shape2[i] == 1) {
            broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]);
        } else {
            return false;
```

@@ -89,10 +88,7 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,

```cpp
    auto broadcast_shape = broadcast_shape_.data(),
         padded_shape1 = padded_shape1_.data(),
         padded_shape2 = padded_shape2_.data();
    if (broadcast_ndim != c->ndim ||
        !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim,
                           broadcast_shape, padded_shape1, padded_shape2,
                           broadcast_ndim)) {
        return false;
    }
    return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim,
```

@@ -126,7 +122,6 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,

```cpp
    return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim));
}

// permute the dimensions of a tensor descriptor
inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
                                          const std::vector<size_t> &order) {
```

@@ -149,10 +144,9 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,

```cpp
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are
// contiguous
inline bool isContiguous(const infiniopTensorDescriptor_t &desc,
                         size_t dim_start, size_t dim_end) {
    for (size_t i = dim_start + 1; i <= dim_end; i++) {
        if (desc->strides[i - 1] !=
            static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
            return false;
        }
    }
```

@@ -168,7 +162,7 @@ inline bool isContiguous(const infiniopTensorDescriptor_t &desc) {

```cpp
// merge the dimensions [dim_start, dim_end] of a tensor descriptor
inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,
                                           size_t dim_start, size_t dim_end) {
    size_t ndim = desc->ndim;
    if (dim_start > dim_end || dim_end >= ndim) {
        return nullptr;
```

@@ -203,11 +197,10 @@ inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,

```cpp
// split the dimension dim of a tensor descriptor into multiple dimensions
inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
                                           size_t dim,
                                           const std::vector<size_t> &dims) {
    size_t ndim = desc->ndim;
    if (desc->shape[dim] !=
        std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies{})) {
        return nullptr;
    }
    size_t new_ndim = ndim + dims.size() - 1;
```

@@ -221,10 +214,7 @@ inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,

```cpp
    }
    for (size_t i = 0; i < dims.size(); i++) {
        new_shape[index] = dims[i];
        new_strides[index] = desc->strides[dim] * desc->shape[dim] /
                             std::accumulate(dims.begin(), dims.begin() + i + 1,
                                             (size_t)1, std::multiplies<size_t>());
        index++;
    }
    for (size_t i = dim + 1; i < ndim; i++) {
```
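The least obvious line in this file is `dimSplit`'s stride computation: the i-th new dimension gets stride strides[dim] · shape[dim] / prod(dims[0..i]). A small standalone check of that formula on a concrete case (plain scalars stand in for the descriptor fields):

```cpp
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main() {
    // Split dim 1 of a contiguous (4, 6) tensor (strides 6, 1) into (2, 3):
    // the result should be shape (4, 2, 3) with strides (6, 3, 1).
    long stride_dim = 1, shape_dim = 6;   // desc->strides[dim], desc->shape[dim]
    std::vector<long> dims = {2, 3};      // the requested split
    for (size_t i = 0; i < dims.size(); i++) {
        long new_stride = stride_dim * shape_dim /
                          std::accumulate(dims.begin(), dims.begin() + i + 1,
                                          1L, std::multiplies<long>());
        std::printf("new_strides[%zu] = %ld\n", i, new_stride); // 3, then 1
    }
    return 0;
}
```

The running product in the denominator is what makes each new dimension's stride the element count of everything split off to its right.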
test/infiniop/__init__.py

```python
import libinfiniop
```

(The only change adds the missing trailing newline at end of file.)
test/infiniop/add.py

@@ -41,8 +41,8 @@ def test(

```python
    lib,
    handle,
    torch_device,
    c_shape,
    a_shape,
    b_shape,
    tensor_dtype=torch.float16,
    inplace=Inplace.OUT_OF_PLACE,
```

@@ -56,13 +56,21 @@ def test(

```python
    a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
    b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
    c = (
        torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
        if inplace == Inplace.OUT_OF_PLACE
        else (a if inplace == Inplace.INPLACE_A else b)
    )

    ans = add(a, b)

    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    c_tensor = (
        to_tensor(c, lib)
        if inplace == Inplace.OUT_OF_PLACE
        else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
    )

    descriptor = infiniopAddDescriptor_t()
    check_error(
```

@@ -91,8 +99,10 @@ def test_cpu(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for c_shape, a_shape, b_shape, inplace in test_cases:
        # fmt: off
        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
        # fmt: on
    destroy_handle(lib, handle)
```

@@ -100,8 +110,10 @@ def test_cuda(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for c_shape, a_shape, b_shape, inplace in test_cases:
        # fmt: off
        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
        # fmt: on
    destroy_handle(lib, handle)
```

@@ -111,13 +123,16 @@ def test_bang(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for c_shape, a_shape, b_shape, inplace in test_cases:
        # fmt: off
        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
        # fmt: on
    destroy_handle(lib, handle)


if __name__ == "__main__":
    test_cases = [
        # fmt: off
        # c_shape, a_shape, b_shape, inplace
        # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
```

@@ -133,6 +148,7 @@ if __name__ == "__main__":

```python
        ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
        ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
        ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
        # fmt: on
    ]
    args = get_args()
    lib = open_lib()
```
test/infiniop/avg_pool.py

@@ -35,7 +35,7 @@ class AvgPoolDescriptor(Structure):

```python
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)


def pool(x, k, padding, stride, dilation=1):
    pooling_layers = {
        1: torch.nn.AvgPool1d,
        2: torch.nn.AvgPool2d,
```

@@ -48,7 +48,9 @@ def pool(x, k, padding, stride, dilation = 1):

```python
        return None
    if ndim == 3 and x.dtype == torch.float16:
        ans = pooling_layers[ndim](k, stride=stride, padding=padding)(
            x.to(torch.float32)
        ).to(torch.float16)
    else:
        ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x)
    if PROFILE:
```

@@ -69,18 +71,20 @@ def inferShape(x_shape, kernel_shape, padding, strides):

```python
    return x_shape[:2] + tuple(output_shape)


# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    array = ctypes.c_int64 * len(py_tuple)
    data_array = array(*py_tuple)
    return ctypes.cast(data_array, ctypes.c_void_p)


def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
```

@@ -90,7 +94,9 @@ def test(

```python
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.rand(
        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
    ).to(torch_device)

    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
```

@@ -126,7 +132,9 @@ def test(

```python
    check_error(
        lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))

    for i in range(NUM_PRERUN if PROFILE else 1):
```

@@ -164,8 +172,10 @@ def test_cpu(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for x_shape, kernel_shape, padding, strides in test_cases:
        # fmt: off
        test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
        test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
```

@@ -173,8 +183,10 @@ def test_cuda(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for x_shape, kernel_shape, padding, strides in test_cases:
        # fmt: off
        test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
        test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
```

@@ -184,17 +196,21 @@ def test_bang(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for x_shape, kernel_shape, padding, strides in test_cases:
        # fmt: off
        test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
        test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)


if __name__ == "__main__":
    test_cases = [
        # fmt: off
        # x_shape, kernel_shape, padding, strides
        ((1, 1, 10), (3,), (1,), (1,)),
        ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
        ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
        # fmt: on
    ]
    args = get_args()
    lib = open_lib()
```
test/infiniop/causal_softmax.py

@@ -101,6 +101,7 @@ def test_bang(lib, test_cases):

```python
        test(lib, handle, "mlu", x_shape, x_stride)
    destroy_handle(lib, handle)


def test_ascend(lib, test_cases):
    import torch_npu
```

@@ -111,11 +112,12 @@ def test_ascend(lib, test_cases):

```python
    destroy_handle(lib, handle)


if __name__ == "__main__":
    test_cases = [
        # x_shape, x_stride
        ((32, 20, 512), None),
        ((32, 20, 512), (20480, 512, 1)),  # Ascend does not support non-contiguous tensors yet
    ]
    args = get_args()
    lib = open_lib()
```
test/infiniop/conv.py

@@ -41,17 +41,11 @@ infiniopConvDescriptor_t = POINTER(ConvDescriptor)

```python
def conv(x, w, stride, padding, dilation):
    match len(x.shape) - 2:
        case 1:
            return F.conv1d(x, w, stride=stride, padding=padding, dilation=dilation)
        case 2:
            return F.conv2d(x, w, stride=stride, padding=padding, dilation=dilation)
        case 3:
            return F.conv3d(x, w, stride=stride, padding=padding, dilation=dilation)
        case _:
            print("Error: Pytorch -> Unsupported tensor dimension")
            return None
```

@@ -66,11 +60,15 @@ def inferShape(

```python
    dilations: List[int],
) -> Tuple[int, ...]:
    assert (
        len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2
    ), "x and w should have the same length; pads, strides, and dilations should have the same length; the length of pads should be that of x - 2"
    output_dims = [
        math.floor(
            (x_shape[i + 2] + 2 * pads[i] - dilations[i] * (w_shape[i + 2] - 1) - 1)
            / strides[i]
            + 1
        )
```

@@ -145,7 +143,9 @@ def test(

```python
    check_error(
        lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))

    for i in range(NUM_PRERUN if PROFILE else 1):
```

@@ -177,7 +177,7 @@ def test(

```python
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"    lib time: {elapsed :6f}")

    if tensor_dtype == torch.float16:
        assert torch.allclose(y, ans, atol=0, rtol=1e-2)
    else:
        assert torch.allclose(y, ans, atol=0, rtol=1e-3)
```

@@ -188,8 +188,10 @@ def test_cpu(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
        # fmt: off
        test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
        test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
```

@@ -197,8 +199,10 @@ def test_cuda(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
        # fmt: off
        test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
        test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
```

@@ -208,8 +212,10 @@ def test_bang(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
        # fmt: off
        test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
        test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
```
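`inferShape` applies the standard convolution output-size formula, out = floor((x + 2·pad − dilation·(kernel − 1) − 1) / stride + 1), per spatial dimension. A quick standalone check against the kind of case these tests use (224×224 input, 3×3 kernel, padding 1, stride 2, dilation 1):

```cpp
#include <cmath>
#include <cstdio>

// Same per-dimension formula as inferShape in conv.py.
int convOutDim(int x, int pad, int k, int dilation, int stride) {
    return static_cast<int>(
        std::floor((x + 2 * pad - dilation * (k - 1) - 1) / (double)stride + 1));
}

int main() {
    // (224 + 2 - 2 - 1) / 2 + 1 = 112.5, floored to 112.
    std::printf("%d\n", convOutDim(224, 1, 3, 1, 2)); // 112
    return 0;
}
```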
test/infiniop/expand.py

@@ -47,10 +47,10 @@ def test(

```python
    lib,
    handle,
    torch_device,
    y_shape,
    x_shape,
    y_stride=None,
    x_stride=None,
    tensor_dtype=torch.float16,
):
    print(
```

@@ -109,8 +109,10 @@ def test_cpu(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for y_shape, x_shape, y_stride, x_stride in test_cases:
        # fmt: off
        test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
        test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
```

@@ -118,8 +120,10 @@ def test_cuda(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for y_shape, x_shape, y_stride, x_stride in test_cases:
        # fmt: off
        test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
        test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
```

@@ -129,13 +133,16 @@ def test_bang(lib, test_cases):

```python
    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for y_shape, x_shape, y_stride, x_stride in test_cases:
        # fmt: off
        test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
        test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)


if __name__ == "__main__":
    test_cases = [
        # fmt: off
        # y_shape, x_shape, y_stride, x_stride
        ((), (), None, None),
        ((3, 3), (1,), None, None),
```

@@ -146,6 +153,7 @@ if __name__ == "__main__":

```python
        ((2, 3, 4, 5), (5,), None, None),
        ((3, 2, 4, 5), (3, 2, 1, 1), None, None),
        ((32, 256, 112, 112), (32, 256, 112, 1), None, None),
        # fmt: on
    ]
    args = get_args()
    lib = open_lib()
```
test/infiniop/gemm.py

@@ -27,6 +27,7 @@ PROFILE = False

```python
NUM_PRERUN = 10
NUM_ITERATIONS = 1000


class GEMMDescriptor(Structure):
    _fields_ = [("device", c_int32)]
```

@@ -34,10 +35,15 @@ class GEMMDescriptor(Structure):

```python
infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor)


def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32):
    A = A.T if transA else A
    B = B.T if transB else B
    result = alpha * torch.matmul(
        A if dtype != torch.float16 else A.to(torch.float32),
        B if dtype != torch.float16 else B.to(torch.float32),
    ).to(dtype)
    if C is not None:
        result += beta * C if dtype != torch.float16 else C.to(torch.float32)
    if PROFILE:
```

@@ -64,7 +70,7 @@ def test(

```python
    dtype=torch.float16,
):
    print(
        f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
        f"a_shape: {a_shape} b_shape: {b_shape} c_shape: {c_shape} y_shape: {y_shape} "
        f"a_stride: {a_stride} b_stride: {b_stride} c_stride: {c_stride} y_stride: {y_stride} dtype: {dtype}"
    )
```

@@ -121,9 +127,7 @@ def test(

```python
    workspace_size = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetGEMMWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to(torch_device
```

@@ -182,8 +186,10 @@ def test_cpu(lib, test_cases):

```python
        c_stride,
        y_stride,
    ) in test_cases:
        # fmt: off
        test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
        test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
```

@@ -204,8 +210,10 @@ def test_cuda(lib, test_cases):

```python
        c_stride,
        y_stride,
    ) in test_cases:
        # fmt: off
        test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
        test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
```

@@ -229,9 +237,10 @@ def test_bang(lib, test_cases):

```python
        c_stride,
        y_stride,
    ) in test_cases:
        # fmt: off
        test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
        test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
        # fmt: on
    destroy_handle(lib, handle)
```
test/infiniop/global_avg_pool.py

@@ -99,7 +99,12 @@ def test(

```python
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopGlobalAvgPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
```
test/infiniop/libinfiniop/__init__.py

```python
import os
import sys

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))

from .liboperators import (
    open_lib,
    CTensor,
    infiniopHandle_t,
    infiniopTensorDescriptor_t,
)
from .devices import *
from .utils import *
from .datatypes import *
```
View file @
3c31dc6c
...
...
@@ -7,7 +7,7 @@ class InfiniDtype:
I32
=
5
I64
=
6
U8
=
7
U16
=
8
U16
=
8
U32
=
9
U64
=
10
F8
=
11
...
...
test/infiniop/libinfiniop/utils.py

@@ -54,6 +54,7 @@ def create_workspace(size, torch_device):

```python
    if size == 0:
        return None
    import torch

    return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device)
```

@@ -172,6 +173,7 @@ def get_args():

```python
def synchronize_device(torch_device):
    import torch

    if torch_device == "cuda":
        torch.cuda.synchronize()
    elif torch_device == "npu":
```

@@ -197,13 +199,24 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):

```python
        If True, the function will print detailed information about any discrepancies between the tensors.
    """
    import numpy as np

    print_discrepancy(actual, desired, atol, rtol, verbose)
    np.testing.assert_allclose(
        actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True, strict=True
    )


def debug_all(
    actual_vals: Sequence,
    desired_vals: Sequence,
    condition: str,
    atol=0,
    rtol=1e-2,
    equal_nan=False,
    verbose=True,
):
    """
    Debugging function to compare two sequences of values (actual and desired) pair by pair, results
    are linked by the given logical condition, and prints discrepancies

    Arguments:
    ----------
```

@@ -223,7 +236,10 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato

```python
    - AssertionError: If the specified `condition` is not 'or' or 'and'.
    """
    assert len(actual_vals) == len(desired_vals), "Invalid Length"
    assert condition in {
        "or",
        "and",
    }, "Invalid condition: should be either 'or' or 'and'"
    import numpy as np

    passed = False if condition == "or" else True
```

@@ -237,14 +253,22 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato

```python
        elif condition == "and":
            if passed and len(indices) != 0:
                passed = False
                print(
                    f"\033[31mThe condition has not been satisfied: Condition #{index + 1}\033[0m"
                )
                np.testing.assert_allclose(
                    actual.cpu(),
                    desired.cpu(),
                    rtol,
                    atol,
                    equal_nan,
                    verbose=True,
                    strict=True,
                )

    assert passed, "\033[31mThe condition has not been satisfied\033[0m"


def print_discrepancy(actual, expected, atol=0, rtol=1e-3, verbose=True):
    if actual.shape != expected.shape:
        raise ValueError("Tensors must have the same shape to compare.")
```

@@ -273,7 +297,9 @@ def print_discrepancy(

```python
    for idx in diff_indices:
        index_tuple = tuple(idx.tolist())
        actual_str = f"{actual[index_tuple]:<{col_width[1]}.{decimal_places[1]}f}"
        expected_str = (
            f"{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}"
        )
        delta_str = f"{delta[index_tuple]:<{col_width[3]}.{decimal_places[3]}f}"
        print(
            f"  > Index: {str(index_tuple):<{col_width[0]}}"
```

@@ -287,10 +313,18 @@ def print_discrepancy(

```python
        print(f" - Desired dtype: {expected.dtype}")
        print(f" - Atol: {atol}")
        print(f" - Rtol: {rtol}")
        print(
            f" - Mismatched elements: {len(diff_indices)} / {actual.numel()} ({len(diff_indices) / actual.numel() * 100}%)"
        )
        print(
            f" - Min(actual) : {torch.min(actual):<{col_width[1]}} | Max(actual) : {torch.max(actual):<{col_width[2]}}"
        )
        print(
            f" - Min(desired): {torch.min(expected):<{col_width[1]}} | Max(desired): {torch.max(expected):<{col_width[2]}}"
        )
        print(
            f" - Min(delta)  : {torch.min(delta):<{col_width[1]}} | Max(delta)  : {torch.max(delta):<{col_width[2]}}"
        )
        print("-" * total_width + "\n")

    return diff_indices
```

@@ -298,14 +332,17 @@ def print_discrepancy(

```python
def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3):
    """
    Returns the atol and rtol for a given tensor data type in the tolerance_map.
    If the given data type is not found, it returns the provided default tolerance values.
    """
    return tolerance_map.get(
        tensor_dtype, {"atol": default_atol, "rtol": default_rtol}
    ).values()


def timed_op(func, num_iterations, device):
    import time

    """Function for timing operations with synchronization."""
    synchronize_device(device)
    start = time.time()
```

@@ -318,7 +355,7 @@ def timed_op(func, num_iterations, device):

```python
def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
    """
    Unified profiling workflow that is used to profile the execution time of a given function.
    It first performs a number of warmup runs, then performs timed execution and
    prints the average execution time.

    Arguments:
```

@@ -328,11 +365,11 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):

```python
    - torch_device (str): The device on which the operation runs, provided for timed execution.
    - NUM_PRERUN (int): The number of warmup runs.
    - NUM_ITERATIONS (int): The number of timed execution iterations, used to calculate the average execution time.
    """
    # Warmup runs
    for _ in range(NUM_PRERUN):
        func()
    # Timed execution
    elapsed = timed_op(lambda: func(), NUM_ITERATIONS, torch_device)
    print(f"{desc} time: {elapsed * 1000 :6f} ms")
```

@@ -347,7 +384,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):

```python
    - lib (ctypes.CDLL): The library object containing the operator implementations.
    - device (InfiniDeviceEnum): The device on which the operator should be tested. See device.py.
    - test_func (function): The test function to be executed for each test case.
    - test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
      to be passed to `test_func`.
    - tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
    """
```

@@ -355,7 +392,13 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):

```python
    try:
        for test_case in test_cases:
            for tensor_dtype in tensor_dtypes:
                test_func(
                    lib,
                    handle,
                    infiniDeviceEnum_str_map[device],
                    *test_case,
                    tensor_dtype,
                )
    finally:
        destroy_handle(lib, handle)
```

@@ -365,22 +408,26 @@ def get_test_devices(args):

```python
    Using the given parsed Namespace to determine the devices to be tested.

    Argument:
    - args: the parsed Namespace object.

    Return:
    - devices_to_test: the devices that will be tested. Default is CPU.
    """
    devices_to_test = []
    if args.cpu:
        devices_to_test.append(InfiniDeviceEnum.CPU)
    if args.nvidia:
        devices_to_test.append(InfiniDeviceEnum.NVIDIA)
    if args.cambricon:
        import torch_mlu

        devices_to_test.append(InfiniDeviceEnum.CAMBRICON)
    if args.ascend:
        import torch
        import torch_npu

        torch.npu.set_device(0)  # Ascend NPU needs explicit device initialization
        devices_to_test.append(InfiniDeviceEnum.ASCEND)

    if not devices_to_test:
        devices_to_test = [InfiniDeviceEnum.CPU]
```