Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
3c31dc6c
Unverified
Commit
3c31dc6c
authored
Feb 18, 2025
by
PanZezhong1725
Committed by
GitHub
Feb 18, 2025
Browse files
Merge pull request #45 from YdrMaster/main
issue/52 代码格式化:机制和效果
parents
16dad776
e5ed9fa1
Changes
48
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
424 additions
and
330 deletions
+424
-330
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
+5
-14
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
+1
-1
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
+3
-3
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
+0
-2
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
+2
-2
src/infiniop/ops/random_sample/operator.cc
src/infiniop/ops/random_sample/operator.cc
+68
-68
src/infiniop/ops/rearrange/operator.cc
src/infiniop/ops/rearrange/operator.cc
+57
-57
src/infiniop/ops/rms_norm/operator.cc
src/infiniop/ops/rms_norm/operator.cc
+80
-80
src/infiniop/ops/utils.h
src/infiniop/ops/utils.h
+27
-37
test/infiniop/__init__.py
test/infiniop/__init__.py
+1
-1
test/infiniop/add.py
test/infiniop/add.py
+20
-4
test/infiniop/avg_pool.py
test/infiniop/avg_pool.py
+22
-6
test/infiniop/causal_softmax.py
test/infiniop/causal_softmax.py
+3
-1
test/infiniop/conv.py
test/infiniop/conv.py
+19
-13
test/infiniop/expand.py
test/infiniop/expand.py
+11
-3
test/infiniop/gemm.py
test/infiniop/gemm.py
+16
-7
test/infiniop/global_avg_pool.py
test/infiniop/global_avg_pool.py
+6
-1
test/infiniop/libinfiniop/__init__.py
test/infiniop/libinfiniop/__init__.py
+8
-2
test/infiniop/libinfiniop/datatypes.py
test/infiniop/libinfiniop/datatypes.py
+1
-1
test/infiniop/libinfiniop/utils.py
test/infiniop/libinfiniop/utils.py
+74
-27
No files found.
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
View file @
3c31dc6c
...
...
@@ -49,20 +49,11 @@ infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
for
(
size_t
i
=
0
;
i
<
info
.
batch
;
++
i
)
{
for
(
size_t
m_
=
0
;
m_
<
info
.
m
;
++
m_
)
{
for
(
size_t
n_
=
0
;
n_
<
info
.
n
;
++
n_
)
{
auto
c_
=
reinterpret_cast
<
Tdata
*>
(
c
)
+
i
*
info
.
c_matrix
.
stride
+
m_
*
info
.
c_matrix
.
row_stride
+
n_
*
info
.
c_matrix
.
col_stride
;
auto
c_
=
reinterpret_cast
<
Tdata
*>
(
c
)
+
i
*
info
.
c_matrix
.
stride
+
m_
*
info
.
c_matrix
.
row_stride
+
n_
*
info
.
c_matrix
.
col_stride
;
float
sum
=
0
;
for
(
size_t
k_
=
0
;
k_
<
info
.
k
;
++
k_
)
{
auto
a_
=
reinterpret_cast
<
Tdata
const
*>
(
a
)
+
i
*
info
.
a_matrix
.
stride
+
m_
*
info
.
a_matrix
.
row_stride
+
k_
*
info
.
a_matrix
.
col_stride
;
auto
b_
=
reinterpret_cast
<
Tdata
const
*>
(
b
)
+
i
*
info
.
b_matrix
.
stride
+
n_
*
info
.
b_matrix
.
col_stride
+
k_
*
info
.
b_matrix
.
row_stride
;
auto
a_
=
reinterpret_cast
<
Tdata
const
*>
(
a
)
+
i
*
info
.
a_matrix
.
stride
+
m_
*
info
.
a_matrix
.
row_stride
+
k_
*
info
.
a_matrix
.
col_stride
;
auto
b_
=
reinterpret_cast
<
Tdata
const
*>
(
b
)
+
i
*
info
.
b_matrix
.
stride
+
n_
*
info
.
b_matrix
.
col_stride
+
k_
*
info
.
b_matrix
.
row_stride
;
if
constexpr
(
std
::
is_same
<
Tdata
,
uint16_t
>::
value
)
{
sum
+=
f16_to_f32
(
*
a_
)
*
f16_to_f32
(
*
b_
);
}
else
{
...
...
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
View file @
3c31dc6c
#include "./matmul_cuda.cuh"
#include "../../utils.h"
#include "./matmul_cuda.cuh"
infiniopStatus_t
cudaCreateMatmulDescriptor
(
infiniopCudaHandle_t
handle
,
infiniopMatmulCudaDescriptor_t
*
desc_ptr
,
...
...
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
View file @
3c31dc6c
#ifndef __INFINIOP_MATMUL_CUDA_H__
#define __INFINIOP_MATMUL_CUDA_H__
#include "matmul_cuda_api.h"
#include "../../../devices/cuda/common_cuda.cuh"
#include <memory>
#include "../blas.h"
#include "matmul_cuda_api.h"
#include <memory>
typedef
struct
InfiniopMatmulCudaDescriptor
{
infiniDevice_t
device
;
...
...
@@ -14,4 +14,4 @@ typedef struct InfiniopMatmulCudaDescriptor {
std
::
shared_ptr
<
Pool
<
cublasHandle_t
>>
cublas_handle_pool
;
}
InfiniopMatmulCudaDescriptor
;
#endif// __INFINIOP_MATMUL_CUDA_H__
#endif
// __INFINIOP_MATMUL_CUDA_H__
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
View file @
3c31dc6c
...
...
@@ -4,7 +4,6 @@
#include "../../../devices/cuda/cuda_handle.h"
#include "infiniop/operator.h"
struct
InfiniopMatmulCudaDescriptor
;
typedef
struct
InfiniopMatmulCudaDescriptor
*
infiniopMatmulCudaDescriptor_t
;
...
...
@@ -28,5 +27,4 @@ infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
infiniopStatus_t
cudaDestroyMatmulDescriptor
(
infiniopMatmulCudaDescriptor_t
desc
);
#endif // __INFINIOP_MATMUL_CUDA_API_H__
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
View file @
3c31dc6c
#include "../../utils.h"
#include "./matmul_cuda.cuh"
template
<
typename
Tdata
>
template
<
typename
Tdata
>
infiniopStatus_t
cudaMatmulCublas
(
infiniopMatmulCudaDescriptor_t
desc
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
void
*
stream
)
{
auto
info
=
desc
->
info
;
...
...
@@ -26,7 +26,7 @@ infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c,
auto
op_a
=
info
.
a_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
auto
op_b
=
info
.
b_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
use_cublas
(
desc
->
cublas_handle_pool
,
desc
->
device_id
,
(
cudaStream_t
)
stream
,
use_cublas
(
desc
->
cublas_handle_pool
,
desc
->
device_id
,
(
cudaStream_t
)
stream
,
[
&
](
cublasHandle_t
handle
)
{
cublasGemmStridedBatchedEx
(
handle
,
op_a
,
...
...
src/infiniop/ops/random_sample/operator.cc
View file @
3c31dc6c
...
...
@@ -4,35 +4,35 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuCreateRandomSampleDescriptor
(
handle
,
(
RandomSampleCpuDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
return
cpuCreateRandomSampleDescriptor
(
handle
,
(
RandomSampleCpuDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
return
cudaCreateRandomSampleDescriptor
((
CudaHandle_t
)
handle
,
(
RandomSampleCudaDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
return
cudaCreateRandomSampleDescriptor
((
CudaHandle_t
)
handle
,
(
RandomSampleCudaDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangCreateRandomSampleDescriptor
((
BangHandle_t
)
handle
,
(
RandomSampleBangDescriptor_t
*
)
desc_ptr
,
result
,
return
bangCreateRandomSampleDescriptor
((
BangHandle_t
)
handle
,
(
RandomSampleBangDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
ascendCreateRandomSampleDescriptor
((
AscendHandle_t
)
handle
,
(
RandomSampleAscendDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
return
ascendCreateRandomSampleDescriptor
((
AscendHandle_t
)
handle
,
(
RandomSampleAscendDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
}
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaCreateRandomSampleDescriptor
((
MacaHandle_t
)
handle
,
(
RandomSampleMacaDescriptor_t
*
)
desc_ptr
,
result
,
return
macaCreateRandomSampleDescriptor
((
MacaHandle_t
)
handle
,
(
RandomSampleMacaDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
return
musaCreateRandomSampleDescriptor
((
MusaHandle_t
)
handle
,
(
RandomSampleMusaDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
return
musaCreateRandomSampleDescriptor
((
MusaHandle_t
)
handle
,
(
RandomSampleMusaDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
#endif
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -42,33 +42,33 @@ __C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDe
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuGetRandomSampleWorkspaceSize
((
RandomSampleCpuDescriptor_t
)
desc
,
size
);
return
cpuGetRandomSampleWorkspaceSize
((
RandomSampleCpuDescriptor_t
)
desc
,
size
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaGetRandomSampleWorkspaceSize
((
RandomSampleCudaDescriptor_t
)
desc
,
size
);
return
cudaGetRandomSampleWorkspaceSize
((
RandomSampleCudaDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangGetRandomSampleWorkspaceSize
((
RandomSampleBangDescriptor_t
)
desc
,
size
);
return
bangGetRandomSampleWorkspaceSize
((
RandomSampleBangDescriptor_t
)
desc
,
size
);
// return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
ascendGetRandomSampleWorkspaceSize
((
RandomSampleAscendDescriptor_t
)
desc
,
size
);
return
ascendGetRandomSampleWorkspaceSize
((
RandomSampleAscendDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaGetRandomSampleWorkspaceSize
((
RandomSampleMacaDescriptor_t
)
desc
,
size
);
return
macaGetRandomSampleWorkspaceSize
((
RandomSampleMacaDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaGetRandomSampleWorkspaceSize
((
RandomSampleMusaDescriptor_t
)
desc
,
size
);
return
musaGetRandomSampleWorkspaceSize
((
RandomSampleMusaDescriptor_t
)
desc
,
size
);
}
#endif
}
...
...
@@ -88,30 +88,30 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuRandomSample
((
RandomSampleCpuDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
cpuRandomSample
((
RandomSampleCpuDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
return
cudaRandomSample
((
RandomSampleCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
cudaRandomSample
((
RandomSampleCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangRandomSample
((
RandomSampleBangDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
bangRandomSample
((
RandomSampleBangDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
ascendRandomSample
((
RandomSampleAscendDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
ascendRandomSample
((
RandomSampleAscendDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
}
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaRandomSample
((
RandomSampleMacaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
macaRandomSample
((
RandomSampleMacaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
return
musaRandomSample
((
RandomSampleMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
musaRandomSample
((
RandomSampleMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
#endif
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -121,30 +121,30 @@ __C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleD
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuDestroyRandomSampleDescriptor
((
RandomSampleCpuDescriptor_t
)
desc
);
return
cpuDestroyRandomSampleDescriptor
((
RandomSampleCpuDescriptor_t
)
desc
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
return
cudaDestroyRandomSampleDescriptor
((
RandomSampleCudaDescriptor_t
)
desc
);
return
cudaDestroyRandomSampleDescriptor
((
RandomSampleCudaDescriptor_t
)
desc
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangDestroyRandomSampleDescriptor
((
RandomSampleBangDescriptor_t
)
desc
);
return
bangDestroyRandomSampleDescriptor
((
RandomSampleBangDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
ascendDestroyRandomSampleDescriptor
((
RandomSampleAscendDescriptor_t
)
desc
);
return
ascendDestroyRandomSampleDescriptor
((
RandomSampleAscendDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaDestroyRandomSampleDescriptor
((
RandomSampleMacaDescriptor_t
)
desc
);
return
macaDestroyRandomSampleDescriptor
((
RandomSampleMacaDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
return
musaDestroyRandomSampleDescriptor
((
RandomSampleMusaDescriptor_t
)
desc
);
return
musaDestroyRandomSampleDescriptor
((
RandomSampleMusaDescriptor_t
)
desc
);
#endif
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/rearrange/operator.cc
View file @
3c31dc6c
...
...
@@ -8,35 +8,35 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuCreateRearrangeDescriptor
(
handle
,
(
RearrangeCpuDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
return
cpuCreateRearrangeDescriptor
(
handle
,
(
RearrangeCpuDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaCreateRearrangeDescriptor
((
CudaHandle_t
)
handle
,
(
RearrangeCudaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
return
cudaCreateRearrangeDescriptor
((
CudaHandle_t
)
handle
,
(
RearrangeCudaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangCreateRearrangeDescriptor
((
BangHandle_t
)
handle
,
(
RearrangeBangDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
return
bangCreateRearrangeDescriptor
((
BangHandle_t
)
handle
,
(
RearrangeBangDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
aclnnCreateRearrangeDescriptor
((
AscendHandle_t
)
handle
,
(
RearrangeAclnnDescriptor_t
*
)
desc_ptr
,
return
aclnnCreateRearrangeDescriptor
((
AscendHandle_t
)
handle
,
(
RearrangeAclnnDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
}
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaCreateRearrangeDescriptor
((
MacaHandle_t
)
handle
,
(
RearrangeMacaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
return
macaCreateRearrangeDescriptor
((
MacaHandle_t
)
handle
,
(
RearrangeMacaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaCreateRearrangeDescriptor
((
MusaHandle_t
)
handle
,
(
RearrangeMusaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
return
musaCreateRearrangeDescriptor
((
MusaHandle_t
)
handle
,
(
RearrangeMusaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
}
#endif
}
...
...
@@ -47,22 +47,22 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuRearrange
((
RearrangeCpuDescriptor_t
)
desc
,
dst
,
src
,
stream
);
return
cpuRearrange
((
RearrangeCpuDescriptor_t
)
desc
,
dst
,
src
,
stream
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaRearrange
((
RearrangeCudaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
return
cudaRearrange
((
RearrangeCudaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangRearrange
((
RearrangeBangDescriptor_t
)
desc
,
dst
,
src
,
stream
);
return
bangRearrange
((
RearrangeBangDescriptor_t
)
desc
,
dst
,
src
,
stream
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
aclnnRearrange
((
RearrangeAclnnDescriptor_t
)
desc
,
return
aclnnRearrange
((
RearrangeAclnnDescriptor_t
)
desc
,
dst
,
src
,
stream
);
...
...
@@ -70,12 +70,12 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaRearrange
((
RearrangeMacaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
return
macaRearrange
((
RearrangeMacaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaRearrange
((
RearrangeMusaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
return
musaRearrange
((
RearrangeMusaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
}
#endif
}
...
...
@@ -86,32 +86,32 @@ __C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescrip
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuDestroyRearrangeDescriptor
((
RearrangeCpuDescriptor_t
)
desc
);
return
cpuDestroyRearrangeDescriptor
((
RearrangeCpuDescriptor_t
)
desc
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaDestroyRearrangeDescriptor
((
RearrangeCudaDescriptor_t
)
desc
);
return
cudaDestroyRearrangeDescriptor
((
RearrangeCudaDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangDestroyRearrangeDescriptor
((
RearrangeBangDescriptor_t
)
desc
);
return
bangDestroyRearrangeDescriptor
((
RearrangeBangDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
aclnnDestroyRearrangeDescriptor
((
RearrangeAclnnDescriptor_t
)
desc
);
return
aclnnDestroyRearrangeDescriptor
((
RearrangeAclnnDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaDestroyRearrangeDescriptor
((
RearrangeMacaDescriptor_t
)
desc
);
return
macaDestroyRearrangeDescriptor
((
RearrangeMacaDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaDestroyRearrangeDescriptor
((
RearrangeMusaDescriptor_t
)
desc
);
return
musaDestroyRearrangeDescriptor
((
RearrangeMusaDescriptor_t
)
desc
);
}
#endif
}
...
...
src/infiniop/ops/rms_norm/operator.cc
View file @
3c31dc6c
...
...
@@ -10,22 +10,22 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuCreateRMSNormDescriptor
(
handle
,
(
RMSNormCpuDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
return
cpuCreateRMSNormDescriptor
(
handle
,
(
RMSNormCpuDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaCreateRMSNormDescriptor
((
CudaHandle_t
)
handle
,
(
RMSNormCudaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
return
cudaCreateRMSNormDescriptor
((
CudaHandle_t
)
handle
,
(
RMSNormCudaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangCreateRMSNormDescriptor
((
BangHandle_t
)
handle
,
(
RMSNormBangDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
return
bangCreateRMSNormDescriptor
((
BangHandle_t
)
handle
,
(
RMSNormBangDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
aclnnCreateRMSNormDescriptor
((
AscendHandle_t
)
handle
,
(
RMSNormAclnnDescriptor_t
*
)
desc_ptr
,
return
aclnnCreateRMSNormDescriptor
((
AscendHandle_t
)
handle
,
(
RMSNormAclnnDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
...
...
@@ -34,12 +34,12 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaCreateRMSNormDescriptor
((
MacaHandle_t
)
handle
,
(
RMSNormMacaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
return
macaCreateRMSNormDescriptor
((
MacaHandle_t
)
handle
,
(
RMSNormMacaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaCreateRMSNormDescriptor
((
MusaHandle_t
)
handle
,
(
RMSNormMusaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
return
musaCreateRMSNormDescriptor
((
MusaHandle_t
)
handle
,
(
RMSNormMusaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
}
#endif
}
...
...
@@ -50,33 +50,33 @@ __C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuGetRMSNormWorkspaceSize
((
RMSNormCpuDescriptor_t
)
desc
,
size
);
return
cpuGetRMSNormWorkspaceSize
((
RMSNormCpuDescriptor_t
)
desc
,
size
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaGetRMSNormWorkspaceSize
((
RMSNormCudaDescriptor_t
)
desc
,
size
);
return
cudaGetRMSNormWorkspaceSize
((
RMSNormCudaDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangGetRMSNormWorkspaceSize
((
RMSNormBangDescriptor_t
)
desc
,
size
);
return
bangGetRMSNormWorkspaceSize
((
RMSNormBangDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
aclnnGetRMSNormWorkspaceSize
((
RMSNormAclnnDescriptor_t
)
desc
,
return
aclnnGetRMSNormWorkspaceSize
((
RMSNormAclnnDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaGetRMSNormWorkspaceSize
((
RMSNormMacaDescriptor_t
)
desc
,
size
);
return
macaGetRMSNormWorkspaceSize
((
RMSNormMacaDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaGetRMSNormWorkspaceSize
((
RMSNormMusaDescriptor_t
)
desc
,
size
);
return
musaGetRMSNormWorkspaceSize
((
RMSNormMusaDescriptor_t
)
desc
,
size
);
}
#endif
}
...
...
@@ -88,22 +88,22 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuRMSNorm
((
RMSNormCpuDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
return
cpuRMSNorm
((
RMSNormCpuDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaRMSNorm
((
RMSNormCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
return
cudaRMSNorm
((
RMSNormCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangRMSNorm
((
RMSNormBangDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
return
bangRMSNorm
((
RMSNormBangDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
aclnnRMSNorm
((
RMSNormAclnnDescriptor_t
)
desc
,
return
aclnnRMSNorm
((
RMSNormAclnnDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
...
...
@@ -114,12 +114,12 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaRMSNorm
((
RMSNormMacaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
return
macaRMSNorm
((
RMSNormMacaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaRMSNorm
((
RMSNormMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
return
musaRMSNorm
((
RMSNormMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
}
#endif
}
...
...
@@ -130,32 +130,32 @@ __C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
return
cpuDestroyRMSNormDescriptor
((
RMSNormCpuDescriptor_t
)
desc
);
return
cpuDestroyRMSNormDescriptor
((
RMSNormCpuDescriptor_t
)
desc
);
#endif
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
return
cudaDestroyRMSNormDescriptor
((
RMSNormCudaDescriptor_t
)
desc
);
return
cudaDestroyRMSNormDescriptor
((
RMSNormCudaDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
return
bangDestroyRMSNormDescriptor
((
RMSNormBangDescriptor_t
)
desc
);
return
bangDestroyRMSNormDescriptor
((
RMSNormBangDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
aclnnDestroyRMSNormDescriptor
((
RMSNormAclnnDescriptor_t
)
desc
);
return
aclnnDestroyRMSNormDescriptor
((
RMSNormAclnnDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaDestroyRMSNormDescriptor
((
RMSNormMacaDescriptor_t
)
desc
);
return
macaDestroyRMSNormDescriptor
((
RMSNormMacaDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaDestroyRMSNormDescriptor
((
RMSNormMusaDescriptor_t
)
desc
);
return
musaDestroyRMSNormDescriptor
((
RMSNormMusaDescriptor_t
)
desc
);
}
#endif
}
...
...
src/infiniop/ops/utils.h
View file @
3c31dc6c
...
...
@@ -67,8 +67,7 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
// compute broadcasted shape
for
(
size_t
i
=
0
;
i
<
max_rank
;
++
i
)
{
if
(
padded_shape1
[
i
]
==
padded_shape2
[
i
]
||
padded_shape1
[
i
]
==
1
||
padded_shape2
[
i
]
==
1
)
{
if
(
padded_shape1
[
i
]
==
padded_shape2
[
i
]
||
padded_shape1
[
i
]
==
1
||
padded_shape2
[
i
]
==
1
)
{
broadcast_shape
[
i
]
=
std
::
max
(
padded_shape1
[
i
],
padded_shape2
[
i
]);
}
else
{
return
false
;
...
...
@@ -89,10 +88,7 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
auto
broadcast_shape
=
broadcast_shape_
.
data
(),
padded_shape1
=
padded_shape1_
.
data
(),
padded_shape2
=
padded_shape2_
.
data
();
if
(
broadcast_ndim
!=
c
->
ndim
||
!
getBroadcastShape
(
a
->
shape
,
a
->
ndim
,
b
->
shape
,
b
->
ndim
,
broadcast_shape
,
padded_shape1
,
padded_shape2
,
broadcast_ndim
))
{
if
(
broadcast_ndim
!=
c
->
ndim
||
!
getBroadcastShape
(
a
->
shape
,
a
->
ndim
,
b
->
shape
,
b
->
ndim
,
broadcast_shape
,
padded_shape1
,
padded_shape2
,
broadcast_ndim
))
{
return
false
;
}
return
std
::
equal
(
broadcast_shape
,
broadcast_shape
+
broadcast_ndim
,
...
...
@@ -126,7 +122,6 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
return
isValidBroadcastShape
(
a
,
b
,
c
,
std
::
max
(
a
->
ndim
,
b
->
ndim
));
}
// permute the dimensions of a tensor descriptor
inline
infiniopTensorDescriptor_t
permute
(
infiniopTensorDescriptor_t
desc
,
const
std
::
vector
<
size_t
>
&
order
)
{
...
...
@@ -151,8 +146,7 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
inline
bool
isContiguous
(
const
infiniopTensorDescriptor_t
&
desc
,
size_t
dim_start
,
size_t
dim_end
)
{
for
(
size_t
i
=
dim_start
+
1
;
i
<=
dim_end
;
i
++
)
{
if
(
desc
->
strides
[
i
-
1
]
!=
static_cast
<
int64_t
>
(
desc
->
shape
[
i
])
*
desc
->
strides
[
i
])
{
if
(
desc
->
strides
[
i
-
1
]
!=
static_cast
<
int64_t
>
(
desc
->
shape
[
i
])
*
desc
->
strides
[
i
])
{
return
false
;
}
}
...
...
@@ -206,8 +200,7 @@ inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
size_t
dim
,
const
std
::
vector
<
size_t
>
&
dims
)
{
size_t
ndim
=
desc
->
ndim
;
if
(
desc
->
shape
[
dim
]
!=
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
(
size_t
)
1
,
std
::
multiplies
{}))
{
if
(
desc
->
shape
[
dim
]
!=
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
(
size_t
)
1
,
std
::
multiplies
{}))
{
return
nullptr
;
}
size_t
new_ndim
=
ndim
+
dims
.
size
()
-
1
;
...
...
@@ -221,10 +214,7 @@ inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
}
for
(
size_t
i
=
0
;
i
<
dims
.
size
();
i
++
)
{
new_shape
[
index
]
=
dims
[
i
];
new_strides
[
index
]
=
desc
->
strides
[
dim
]
*
desc
->
shape
[
dim
]
/
std
::
accumulate
(
dims
.
begin
(),
dims
.
begin
()
+
i
+
1
,
(
size_t
)
1
,
std
::
multiplies
<
size_t
>
());
new_strides
[
index
]
=
desc
->
strides
[
dim
]
*
desc
->
shape
[
dim
]
/
std
::
accumulate
(
dims
.
begin
(),
dims
.
begin
()
+
i
+
1
,
(
size_t
)
1
,
std
::
multiplies
<
size_t
>
());
index
++
;
}
for
(
size_t
i
=
dim
+
1
;
i
<
ndim
;
i
++
)
{
...
...
test/infiniop/__init__.py
View file @
3c31dc6c
test/infiniop/add.py
View file @
3c31dc6c
...
...
@@ -56,13 +56,21 @@ def test(
a
=
torch
.
rand
(
a_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
b
=
torch
.
rand
(
b_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
c
=
torch
.
rand
(
c_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
if
inplace
==
Inplace
.
OUT_OF_PLACE
else
(
a
if
inplace
==
Inplace
.
INPLACE_A
else
b
)
c
=
(
torch
.
rand
(
c_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
if
inplace
==
Inplace
.
OUT_OF_PLACE
else
(
a
if
inplace
==
Inplace
.
INPLACE_A
else
b
)
)
ans
=
add
(
a
,
b
)
a_tensor
=
to_tensor
(
a
,
lib
)
b_tensor
=
to_tensor
(
b
,
lib
)
c_tensor
=
to_tensor
(
c
,
lib
)
if
inplace
==
Inplace
.
OUT_OF_PLACE
else
(
a_tensor
if
inplace
==
Inplace
.
INPLACE_A
else
b_tensor
)
c_tensor
=
(
to_tensor
(
c
,
lib
)
if
inplace
==
Inplace
.
OUT_OF_PLACE
else
(
a_tensor
if
inplace
==
Inplace
.
INPLACE_A
else
b_tensor
)
)
descriptor
=
infiniopAddDescriptor_t
()
check_error
(
...
...
@@ -91,8 +99,10 @@ def test_cpu(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"cpu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
@@ -100,8 +110,10 @@ def test_cuda(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CUDA
handle
=
create_handle
(
lib
,
device
)
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"cuda"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
@@ -111,13 +123,16 @@ def test_bang(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_BANG
handle
=
create_handle
(
lib
,
device
)
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"mlu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
# fmt: on
destroy_handle
(
lib
,
handle
)
if
__name__
==
"__main__"
:
test_cases
=
[
# fmt: off
# c_shape, a_shape, b_shape, inplace
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
...
...
@@ -133,6 +148,7 @@ if __name__ == "__main__":
((
2
,
4
,
3
),
(
2
,
1
,
3
),
(
4
,
3
),
Inplace
.
OUT_OF_PLACE
),
((
2
,
3
,
4
,
5
),
(
2
,
3
,
4
,
5
),
(
5
,),
Inplace
.
OUT_OF_PLACE
),
((
3
,
2
,
4
,
5
),
(
4
,
5
),
(
3
,
2
,
1
,
1
),
Inplace
.
OUT_OF_PLACE
),
# fmt: on
]
args
=
get_args
()
lib
=
open_lib
()
...
...
test/infiniop/avg_pool.py
View file @
3c31dc6c
...
...
@@ -35,7 +35,7 @@ class AvgPoolDescriptor(Structure):
infiniopAvgPoolDescriptor_t
=
POINTER
(
AvgPoolDescriptor
)
def
pool
(
x
,
k
,
padding
,
stride
,
dilation
=
1
):
def
pool
(
x
,
k
,
padding
,
stride
,
dilation
=
1
):
pooling_layers
=
{
1
:
torch
.
nn
.
AvgPool1d
,
2
:
torch
.
nn
.
AvgPool2d
,
...
...
@@ -48,7 +48,9 @@ def pool(x, k, padding, stride, dilation = 1):
return
None
if
ndim
==
3
and
x
.
dtype
==
torch
.
float16
:
ans
=
pooling_layers
[
ndim
](
k
,
stride
=
stride
,
padding
=
padding
)(
x
.
to
(
torch
.
float32
)).
to
(
torch
.
float16
)
ans
=
pooling_layers
[
ndim
](
k
,
stride
=
stride
,
padding
=
padding
)(
x
.
to
(
torch
.
float32
)
).
to
(
torch
.
float16
)
else
:
ans
=
pooling_layers
[
ndim
](
k
,
stride
=
stride
,
padding
=
padding
)(
x
)
if
PROFILE
:
...
...
@@ -69,12 +71,14 @@ def inferShape(x_shape, kernel_shape, padding, strides):
return
x_shape
[:
2
]
+
tuple
(
output_shape
)
# convert a python tuple to a ctype void pointer
def
tuple_to_void_p
(
py_tuple
:
Tuple
):
array
=
ctypes
.
c_int64
*
len
(
py_tuple
)
data_array
=
array
(
*
py_tuple
)
return
ctypes
.
cast
(
data_array
,
ctypes
.
c_void_p
)
def
test
(
lib
,
handle
,
...
...
@@ -90,7 +94,9 @@ def test(
)
x
=
torch
.
rand
(
x_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
y
=
torch
.
rand
(
inferShape
(
x_shape
,
k_shape
,
padding
,
strides
),
dtype
=
tensor_dtype
).
to
(
torch_device
)
y
=
torch
.
rand
(
inferShape
(
x_shape
,
k_shape
,
padding
,
strides
),
dtype
=
tensor_dtype
).
to
(
torch_device
)
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
ans
=
pool
(
x
,
k_shape
,
padding
,
strides
)
...
...
@@ -126,7 +132,9 @@ def test(
check_error
(
lib
.
infiniopGetAvgPoolWorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspaceSize
))
)
workspace
=
torch
.
zeros
(
int
(
workspaceSize
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
)
workspace
=
torch
.
zeros
(
int
(
workspaceSize
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
)
workspace_ptr
=
ctypes
.
cast
(
workspace
.
data_ptr
(),
ctypes
.
POINTER
(
ctypes
.
c_uint8
))
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
...
...
@@ -164,8 +172,10 @@ def test_cpu(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
kernel_shape
,
padding
,
strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
@@ -173,8 +183,10 @@ def test_cuda(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CUDA
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
kernel_shape
,
padding
,
strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
@@ -184,17 +196,21 @@ def test_bang(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_BANG
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
kernel_shape
,
padding
,
strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
if
__name__
==
"__main__"
:
test_cases
=
[
# fmt: off
# x_shape, kernel_shape, padding, strides
((
1
,
1
,
10
),
(
3
,),
(
1
,),
(
1
,)),
((
32
,
3
,
224
,
224
),
(
3
,
3
),
(
1
,
1
),
(
2
,
2
)),
((
1
,
1
,
16
,
16
,
16
),
(
5
,
5
,
5
),
(
2
,
2
,
2
),
(
2
,
2
,
2
)),
# fmt: on
]
args
=
get_args
()
lib
=
open_lib
()
...
...
test/infiniop/causal_softmax.py
View file @
3c31dc6c
...
...
@@ -101,6 +101,7 @@ def test_bang(lib, test_cases):
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
x_stride
)
destroy_handle
(
lib
,
handle
)
def
test_ascend
(
lib
,
test_cases
):
import
torch_npu
...
...
@@ -111,6 +112,7 @@ def test_ascend(lib, test_cases):
destroy_handle
(
lib
,
handle
)
if
__name__
==
"__main__"
:
test_cases
=
[
# x_shape, x_stride
...
...
test/infiniop/conv.py
View file @
3c31dc6c
...
...
@@ -41,17 +41,11 @@ infiniopConvDescriptor_t = POINTER(ConvDescriptor)
def
conv
(
x
,
w
,
stride
,
padding
,
dilation
):
match
len
(
x
.
shape
)
-
2
:
case
1
:
return
F
.
conv1d
(
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
return
F
.
conv1d
(
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
case
2
:
return
F
.
conv2d
(
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
return
F
.
conv2d
(
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
case
3
:
return
F
.
conv3d
(
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
return
F
.
conv3d
(
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
case
_
:
print
(
"Error: Pytorch -> Unsupported tensor dimension"
)
return
None
...
...
@@ -66,11 +60,15 @@ def inferShape(
dilations
:
List
[
int
],
)
->
Tuple
[
int
,
...]:
assert
(
len
(
x_shape
)
==
len
(
w_shape
)
==
len
(
pads
)
+
2
==
len
(
dilations
)
+
2
==
len
(
strides
)
+
2
len
(
x_shape
)
==
len
(
w_shape
)
==
len
(
pads
)
+
2
==
len
(
dilations
)
+
2
==
len
(
strides
)
+
2
),
"x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
output_dims
=
[
math
.
floor
(
(
x_shape
[
i
+
2
]
+
2
*
pads
[
i
]
-
dilations
[
i
]
*
(
w_shape
[
i
+
2
]
-
1
)
-
1
)
(
x_shape
[
i
+
2
]
+
2
*
pads
[
i
]
-
dilations
[
i
]
*
(
w_shape
[
i
+
2
]
-
1
)
-
1
)
/
strides
[
i
]
+
1
)
...
...
@@ -145,7 +143,9 @@ def test(
check_error
(
lib
.
infiniopGetConvWorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspaceSize
))
)
workspace
=
torch
.
zeros
(
int
(
workspaceSize
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
)
workspace
=
torch
.
zeros
(
int
(
workspaceSize
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
)
workspace_ptr
=
ctypes
.
cast
(
workspace
.
data_ptr
(),
ctypes
.
POINTER
(
ctypes
.
c_uint8
))
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
...
...
@@ -177,7 +177,7 @@ def test(
elapsed
=
(
time
.
time
()
-
start_time
)
/
NUM_ITERATIONS
print
(
f
" lib time:
{
elapsed
:
6
f
}
"
)
if
(
tensor_dtype
==
torch
.
float16
)
:
if
tensor_dtype
==
torch
.
float16
:
assert
torch
.
allclose
(
y
,
ans
,
atol
=
0
,
rtol
=
1e-2
)
else
:
assert
torch
.
allclose
(
y
,
ans
,
atol
=
0
,
rtol
=
1e-3
)
...
...
@@ -188,8 +188,10 @@ def test_cpu(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
@@ -197,8 +199,10 @@ def test_cuda(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CUDA
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
@@ -208,8 +212,10 @@ def test_bang(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_BANG
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
test/infiniop/expand.py
View file @
3c31dc6c
...
...
@@ -109,8 +109,10 @@ def test_cpu(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
for
y_shape
,
x_shape
,
y_stride
,
x_stride
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
@@ -118,8 +120,10 @@ def test_cuda(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CUDA
handle
=
create_handle
(
lib
,
device
)
for
y_shape
,
x_shape
,
y_stride
,
x_stride
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
@@ -129,13 +133,16 @@ def test_bang(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_BANG
handle
=
create_handle
(
lib
,
device
)
for
y_shape
,
x_shape
,
y_stride
,
x_stride
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
if
__name__
==
"__main__"
:
test_cases
=
[
# fmt: off
# y_shape, x_shape, y_stride, x_stride
((),
(),
None
,
None
),
((
3
,
3
),
(
1
,),
None
,
None
),
...
...
@@ -146,6 +153,7 @@ if __name__ == "__main__":
((
2
,
3
,
4
,
5
),
(
5
,),
None
,
None
),
((
3
,
2
,
4
,
5
),
(
3
,
2
,
1
,
1
),
None
,
None
),
((
32
,
256
,
112
,
112
),
(
32
,
256
,
112
,
1
),
None
,
None
),
# fmt: on
]
args
=
get_args
()
lib
=
open_lib
()
...
...
test/infiniop/gemm.py
View file @
3c31dc6c
...
...
@@ -27,6 +27,7 @@ PROFILE = False
NUM_PRERUN
=
10
NUM_ITERATIONS
=
1000
class
GEMMDescriptor
(
Structure
):
_fields_
=
[(
"device"
,
c_int32
)]
...
...
@@ -34,10 +35,15 @@ class GEMMDescriptor(Structure):
infiniopGEMMDescriptor_t
=
POINTER
(
GEMMDescriptor
)
def
gemm
(
A
,
B
,
C
=
None
,
transA
=
False
,
transB
=
False
,
alpha
=
1.0
,
beta
=
0.0
,
dtype
=
torch
.
float32
):
def
gemm
(
A
,
B
,
C
=
None
,
transA
=
False
,
transB
=
False
,
alpha
=
1.0
,
beta
=
0.0
,
dtype
=
torch
.
float32
):
A
=
A
.
T
if
transA
else
A
B
=
B
.
T
if
transB
else
B
result
=
alpha
*
torch
.
matmul
(
A
if
dtype
!=
torch
.
float16
else
A
.
to
(
torch
.
float32
),
B
if
dtype
!=
torch
.
float16
else
B
.
to
(
torch
.
float32
)).
to
(
dtype
)
result
=
alpha
*
torch
.
matmul
(
A
if
dtype
!=
torch
.
float16
else
A
.
to
(
torch
.
float32
),
B
if
dtype
!=
torch
.
float16
else
B
.
to
(
torch
.
float32
),
).
to
(
dtype
)
if
C
is
not
None
:
result
+=
beta
*
C
if
dtype
!=
torch
.
float16
else
C
.
to
(
torch
.
float32
)
if
PROFILE
:
...
...
@@ -121,9 +127,7 @@ def test(
workspace_size
=
ctypes
.
c_uint64
(
0
)
check_error
(
lib
.
infiniopGetGEMMWorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspace_size
)
)
lib
.
infiniopGetGEMMWorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspace_size
))
)
workspace
=
torch
.
zeros
(
int
(
workspace_size
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
...
...
@@ -182,8 +186,10 @@ def test_cpu(lib, test_cases):
c_stride
,
y_stride
,
)
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
@@ -204,8 +210,10 @@ def test_cuda(lib, test_cases):
c_stride
,
y_stride
,
)
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
@@ -229,9 +237,10 @@ def test_bang(lib, test_cases):
c_stride
,
y_stride
,
)
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
...
...
test/infiniop/global_avg_pool.py
View file @
3c31dc6c
...
...
@@ -99,7 +99,12 @@ def test(
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
check_error
(
lib
.
infiniopGlobalAvgPool
(
descriptor
,
workspace_ptr
,
workspaceSize
,
y_tensor
.
data
,
x_tensor
.
data
,
None
descriptor
,
workspace_ptr
,
workspaceSize
,
y_tensor
.
data
,
x_tensor
.
data
,
None
,
)
)
if
PROFILE
:
...
...
test/infiniop/libinfiniop/__init__.py
View file @
3c31dc6c
import
os
import
sys
sys
.
path
.
insert
(
0
,
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'.'
)))
from
.liboperators
import
open_lib
,
CTensor
,
infiniopHandle_t
,
infiniopTensorDescriptor_t
sys
.
path
.
insert
(
0
,
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"."
)))
from
.liboperators
import
(
open_lib
,
CTensor
,
infiniopHandle_t
,
infiniopTensorDescriptor_t
,
)
from
.devices
import
*
from
.utils
import
*
from
.datatypes
import
*
test/infiniop/libinfiniop/datatypes.py
View file @
3c31dc6c
test/infiniop/libinfiniop/utils.py
View file @
3c31dc6c
...
...
@@ -54,6 +54,7 @@ def create_workspace(size, torch_device):
if
size
==
0
:
return
None
import
torch
return
torch
.
zeros
(
size
=
(
size
,),
dtype
=
torch
.
uint8
,
device
=
torch_device
)
...
...
@@ -172,6 +173,7 @@ def get_args():
def
synchronize_device
(
torch_device
):
import
torch
if
torch_device
==
"cuda"
:
torch
.
cuda
.
synchronize
()
elif
torch_device
==
"npu"
:
...
...
@@ -197,11 +199,22 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
If True, the function will print detailed information about any discrepancies between the tensors.
"""
import
numpy
as
np
print_discrepancy
(
actual
,
desired
,
atol
,
rtol
,
verbose
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
)
def
debug_all
(
actual_vals
:
Sequence
,
desired_vals
:
Sequence
,
condition
:
str
,
atol
=
0
,
rtol
=
1e-2
,
equal_nan
=
False
,
verbose
=
True
):
def
debug_all
(
actual_vals
:
Sequence
,
desired_vals
:
Sequence
,
condition
:
str
,
atol
=
0
,
rtol
=
1e-2
,
equal_nan
=
False
,
verbose
=
True
,
):
"""
Debugging function to compare two sequences of values (actual and desired) pair by pair, results
are linked by the given logical condition, and prints discrepancies
...
...
@@ -223,7 +236,10 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
- AssertionError: If the specified `condition` is not 'or' or 'and'.
"""
assert
len
(
actual_vals
)
==
len
(
desired_vals
),
"Invalid Length"
assert
condition
in
{
"or"
,
"and"
},
"Invalid condition: should be either 'or' or 'and'"
assert
condition
in
{
"or"
,
"and"
,
},
"Invalid condition: should be either 'or' or 'and'"
import
numpy
as
np
passed
=
False
if
condition
==
"or"
else
True
...
...
@@ -237,14 +253,22 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
elif
condition
==
"and"
:
if
passed
and
len
(
indices
)
!=
0
:
passed
=
False
print
(
f
"
\033
[31mThe condition has not been satisfied: Condition #
{
index
+
1
}
\033
[0m"
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
)
print
(
f
"
\033
[31mThe condition has not been satisfied: Condition #
{
index
+
1
}
\033
[0m"
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
,
)
assert
passed
,
"
\033
[31mThe condition has not been satisfied
\033
[0m"
def
print_discrepancy
(
actual
,
expected
,
atol
=
0
,
rtol
=
1e-3
,
verbose
=
True
):
def
print_discrepancy
(
actual
,
expected
,
atol
=
0
,
rtol
=
1e-3
,
verbose
=
True
):
if
actual
.
shape
!=
expected
.
shape
:
raise
ValueError
(
"Tensors must have the same shape to compare."
)
...
...
@@ -273,7 +297,9 @@ def print_discrepancy(
for
idx
in
diff_indices
:
index_tuple
=
tuple
(
idx
.
tolist
())
actual_str
=
f
"
{
actual
[
index_tuple
]:
<
{
col_width
[
1
]
}
.
{
decimal_places
[
1
]
}
f
}
"
expected_str
=
f
"
{
expected
[
index_tuple
]:
<
{
col_width
[
2
]
}
.
{
decimal_places
[
2
]
}
f
}
"
expected_str
=
(
f
"
{
expected
[
index_tuple
]:
<
{
col_width
[
2
]
}
.
{
decimal_places
[
2
]
}
f
}
"
)
delta_str
=
f
"
{
delta
[
index_tuple
]:
<
{
col_width
[
3
]
}
.
{
decimal_places
[
3
]
}
f
}
"
print
(
f
" > Index:
{
str
(
index_tuple
):
<
{
col_width
[
0
]
}}
"
...
...
@@ -287,10 +313,18 @@ def print_discrepancy(
print
(
f
" - Desired dtype:
{
expected
.
dtype
}
"
)
print
(
f
" - Atol:
{
atol
}
"
)
print
(
f
" - Rtol:
{
rtol
}
"
)
print
(
f
" - Mismatched elements:
{
len
(
diff_indices
)
}
/
{
actual
.
numel
()
}
(
{
len
(
diff_indices
)
/
actual
.
numel
()
*
100
}
%)"
)
print
(
f
" - Min(actual) :
{
torch
.
min
(
actual
):
<
{
col_width
[
1
]
}}
| Max(actual) :
{
torch
.
max
(
actual
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(desired):
{
torch
.
min
(
expected
):
<
{
col_width
[
1
]
}}
| Max(desired):
{
torch
.
max
(
expected
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(delta) :
{
torch
.
min
(
delta
):
<
{
col_width
[
1
]
}}
| Max(delta) :
{
torch
.
max
(
delta
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Mismatched elements:
{
len
(
diff_indices
)
}
/
{
actual
.
numel
()
}
(
{
len
(
diff_indices
)
/
actual
.
numel
()
*
100
}
%)"
)
print
(
f
" - Min(actual) :
{
torch
.
min
(
actual
):
<
{
col_width
[
1
]
}}
| Max(actual) :
{
torch
.
max
(
actual
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(desired):
{
torch
.
min
(
expected
):
<
{
col_width
[
1
]
}}
| Max(desired):
{
torch
.
max
(
expected
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(delta) :
{
torch
.
min
(
delta
):
<
{
col_width
[
1
]
}}
| Max(delta) :
{
torch
.
max
(
delta
):
<
{
col_width
[
2
]
}}
"
)
print
(
"-"
*
total_width
+
"
\n
"
)
return
diff_indices
...
...
@@ -301,11 +335,14 @@ def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3
Returns the atol and rtol for a given tensor data type in the tolerance_map.
If the given data type is not found, it returns the provided default tolerance values.
"""
return
tolerance_map
.
get
(
tensor_dtype
,
{
'atol'
:
default_atol
,
'rtol'
:
default_rtol
}).
values
()
return
tolerance_map
.
get
(
tensor_dtype
,
{
"atol"
:
default_atol
,
"rtol"
:
default_rtol
}
).
values
()
def
timed_op
(
func
,
num_iterations
,
device
):
import
time
""" Function for timing operations with synchronization. """
synchronize_device
(
device
)
start
=
time
.
time
()
...
...
@@ -355,7 +392,13 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
try
:
for
test_case
in
test_cases
:
for
tensor_dtype
in
tensor_dtypes
:
test_func
(
lib
,
handle
,
infiniDeviceEnum_str_map
[
device
],
*
test_case
,
tensor_dtype
)
test_func
(
lib
,
handle
,
infiniDeviceEnum_str_map
[
device
],
*
test_case
,
tensor_dtype
,
)
finally
:
destroy_handle
(
lib
,
handle
)
...
...
@@ -372,14 +415,18 @@ def get_test_devices(args):
"""
devices_to_test
=
[]
if
args
.
cpu
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
CPU
)
if
args
.
nvidia
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
NVIDIA
)
if
args
.
cpu
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
CPU
)
if
args
.
nvidia
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
NVIDIA
)
if
args
.
cambricon
:
import
torch_mlu
devices_to_test
.
append
(
InfiniDeviceEnum
.
CAMBRICON
)
if
args
.
ascend
:
import
torch
import
torch_npu
torch
.
npu
.
set_device
(
0
)
# Ascend NPU needs explicit device initialization
devices_to_test
.
append
(
InfiniDeviceEnum
.
ASCEND
)
if
not
devices_to_test
:
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment