Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
3c31dc6c
Unverified
Commit
3c31dc6c
authored
Feb 18, 2025
by
PanZezhong1725
Committed by
GitHub
Feb 18, 2025
Browse files
Merge pull request #45 from YdrMaster/main
issue/52 代码格式化:机制和效果
parents
16dad776
e5ed9fa1
Changes
48
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
424 additions
and
330 deletions
+424
-330
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
+5
-14
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
+1
-1
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
+3
-3
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
+0
-2
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
+2
-2
src/infiniop/ops/random_sample/operator.cc
src/infiniop/ops/random_sample/operator.cc
+68
-68
src/infiniop/ops/rearrange/operator.cc
src/infiniop/ops/rearrange/operator.cc
+57
-57
src/infiniop/ops/rms_norm/operator.cc
src/infiniop/ops/rms_norm/operator.cc
+80
-80
src/infiniop/ops/utils.h
src/infiniop/ops/utils.h
+27
-37
test/infiniop/__init__.py
test/infiniop/__init__.py
+1
-1
test/infiniop/add.py
test/infiniop/add.py
+20
-4
test/infiniop/avg_pool.py
test/infiniop/avg_pool.py
+22
-6
test/infiniop/causal_softmax.py
test/infiniop/causal_softmax.py
+3
-1
test/infiniop/conv.py
test/infiniop/conv.py
+19
-13
test/infiniop/expand.py
test/infiniop/expand.py
+11
-3
test/infiniop/gemm.py
test/infiniop/gemm.py
+16
-7
test/infiniop/global_avg_pool.py
test/infiniop/global_avg_pool.py
+6
-1
test/infiniop/libinfiniop/__init__.py
test/infiniop/libinfiniop/__init__.py
+8
-2
test/infiniop/libinfiniop/datatypes.py
test/infiniop/libinfiniop/datatypes.py
+1
-1
test/infiniop/libinfiniop/utils.py
test/infiniop/libinfiniop/utils.py
+74
-27
No files found.
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
View file @
3c31dc6c
...
@@ -38,8 +38,8 @@ cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc) {
...
@@ -38,8 +38,8 @@ cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc) {
template
<
typename
Tdata
>
template
<
typename
Tdata
>
infiniopStatus_t
cpuCalculateMatmul
(
infiniopMatmulCpuDescriptor_t
desc
,
void
*
c
,
infiniopStatus_t
cpuCalculateMatmul
(
infiniopMatmulCpuDescriptor_t
desc
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
)
{
float
alpha
)
{
auto
info
=
desc
->
info
;
auto
info
=
desc
->
info
;
if
(
info
.
is_transed
)
{
if
(
info
.
is_transed
)
{
...
@@ -49,20 +49,11 @@ infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
...
@@ -49,20 +49,11 @@ infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
for
(
size_t
i
=
0
;
i
<
info
.
batch
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
info
.
batch
;
++
i
)
{
for
(
size_t
m_
=
0
;
m_
<
info
.
m
;
++
m_
)
{
for
(
size_t
m_
=
0
;
m_
<
info
.
m
;
++
m_
)
{
for
(
size_t
n_
=
0
;
n_
<
info
.
n
;
++
n_
)
{
for
(
size_t
n_
=
0
;
n_
<
info
.
n
;
++
n_
)
{
auto
c_
=
reinterpret_cast
<
Tdata
*>
(
c
)
+
auto
c_
=
reinterpret_cast
<
Tdata
*>
(
c
)
+
i
*
info
.
c_matrix
.
stride
+
m_
*
info
.
c_matrix
.
row_stride
+
n_
*
info
.
c_matrix
.
col_stride
;
i
*
info
.
c_matrix
.
stride
+
m_
*
info
.
c_matrix
.
row_stride
+
n_
*
info
.
c_matrix
.
col_stride
;
float
sum
=
0
;
float
sum
=
0
;
for
(
size_t
k_
=
0
;
k_
<
info
.
k
;
++
k_
)
{
for
(
size_t
k_
=
0
;
k_
<
info
.
k
;
++
k_
)
{
auto
a_
=
reinterpret_cast
<
Tdata
const
*>
(
a
)
+
auto
a_
=
reinterpret_cast
<
Tdata
const
*>
(
a
)
+
i
*
info
.
a_matrix
.
stride
+
m_
*
info
.
a_matrix
.
row_stride
+
k_
*
info
.
a_matrix
.
col_stride
;
i
*
info
.
a_matrix
.
stride
+
auto
b_
=
reinterpret_cast
<
Tdata
const
*>
(
b
)
+
i
*
info
.
b_matrix
.
stride
+
n_
*
info
.
b_matrix
.
col_stride
+
k_
*
info
.
b_matrix
.
row_stride
;
m_
*
info
.
a_matrix
.
row_stride
+
k_
*
info
.
a_matrix
.
col_stride
;
auto
b_
=
reinterpret_cast
<
Tdata
const
*>
(
b
)
+
i
*
info
.
b_matrix
.
stride
+
n_
*
info
.
b_matrix
.
col_stride
+
k_
*
info
.
b_matrix
.
row_stride
;
if
constexpr
(
std
::
is_same
<
Tdata
,
uint16_t
>::
value
)
{
if
constexpr
(
std
::
is_same
<
Tdata
,
uint16_t
>::
value
)
{
sum
+=
f16_to_f32
(
*
a_
)
*
f16_to_f32
(
*
b_
);
sum
+=
f16_to_f32
(
*
a_
)
*
f16_to_f32
(
*
b_
);
}
else
{
}
else
{
...
...
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
View file @
3c31dc6c
#include "./matmul_cuda.cuh"
#include "../../utils.h"
#include "../../utils.h"
#include "./matmul_cuda.cuh"
infiniopStatus_t
cudaCreateMatmulDescriptor
(
infiniopCudaHandle_t
handle
,
infiniopStatus_t
cudaCreateMatmulDescriptor
(
infiniopCudaHandle_t
handle
,
infiniopMatmulCudaDescriptor_t
*
desc_ptr
,
infiniopMatmulCudaDescriptor_t
*
desc_ptr
,
...
...
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
View file @
3c31dc6c
#ifndef __INFINIOP_MATMUL_CUDA_H__
#ifndef __INFINIOP_MATMUL_CUDA_H__
#define __INFINIOP_MATMUL_CUDA_H__
#define __INFINIOP_MATMUL_CUDA_H__
#include "matmul_cuda_api.h"
#include "../../../devices/cuda/common_cuda.cuh"
#include "../../../devices/cuda/common_cuda.cuh"
#include <memory>
#include "../blas.h"
#include "../blas.h"
#include "matmul_cuda_api.h"
#include <memory>
typedef
struct
InfiniopMatmulCudaDescriptor
{
typedef
struct
InfiniopMatmulCudaDescriptor
{
infiniDevice_t
device
;
infiniDevice_t
device
;
...
@@ -14,4 +14,4 @@ typedef struct InfiniopMatmulCudaDescriptor {
...
@@ -14,4 +14,4 @@ typedef struct InfiniopMatmulCudaDescriptor {
std
::
shared_ptr
<
Pool
<
cublasHandle_t
>>
cublas_handle_pool
;
std
::
shared_ptr
<
Pool
<
cublasHandle_t
>>
cublas_handle_pool
;
}
InfiniopMatmulCudaDescriptor
;
}
InfiniopMatmulCudaDescriptor
;
#endif// __INFINIOP_MATMUL_CUDA_H__
#endif
// __INFINIOP_MATMUL_CUDA_H__
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
View file @
3c31dc6c
...
@@ -4,7 +4,6 @@
...
@@ -4,7 +4,6 @@
#include "../../../devices/cuda/cuda_handle.h"
#include "../../../devices/cuda/cuda_handle.h"
#include "infiniop/operator.h"
#include "infiniop/operator.h"
struct
InfiniopMatmulCudaDescriptor
;
struct
InfiniopMatmulCudaDescriptor
;
typedef
struct
InfiniopMatmulCudaDescriptor
*
infiniopMatmulCudaDescriptor_t
;
typedef
struct
InfiniopMatmulCudaDescriptor
*
infiniopMatmulCudaDescriptor_t
;
...
@@ -28,5 +27,4 @@ infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
...
@@ -28,5 +27,4 @@ infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
infiniopStatus_t
cudaDestroyMatmulDescriptor
(
infiniopMatmulCudaDescriptor_t
desc
);
infiniopStatus_t
cudaDestroyMatmulDescriptor
(
infiniopMatmulCudaDescriptor_t
desc
);
#endif // __INFINIOP_MATMUL_CUDA_API_H__
#endif // __INFINIOP_MATMUL_CUDA_API_H__
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
View file @
3c31dc6c
#include "../../utils.h"
#include "../../utils.h"
#include "./matmul_cuda.cuh"
#include "./matmul_cuda.cuh"
template
<
typename
Tdata
>
template
<
typename
Tdata
>
infiniopStatus_t
cudaMatmulCublas
(
infiniopMatmulCudaDescriptor_t
desc
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
void
*
stream
)
{
infiniopStatus_t
cudaMatmulCublas
(
infiniopMatmulCudaDescriptor_t
desc
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
void
*
stream
)
{
auto
info
=
desc
->
info
;
auto
info
=
desc
->
info
;
...
@@ -26,7 +26,7 @@ infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c,
...
@@ -26,7 +26,7 @@ infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c,
auto
op_a
=
info
.
a_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
auto
op_a
=
info
.
a_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
auto
op_b
=
info
.
b_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
auto
op_b
=
info
.
b_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
use_cublas
(
desc
->
cublas_handle_pool
,
desc
->
device_id
,
(
cudaStream_t
)
stream
,
use_cublas
(
desc
->
cublas_handle_pool
,
desc
->
device_id
,
(
cudaStream_t
)
stream
,
[
&
](
cublasHandle_t
handle
)
{
cublasGemmStridedBatchedEx
(
[
&
](
cublasHandle_t
handle
)
{
cublasGemmStridedBatchedEx
(
handle
,
handle
,
op_a
,
op_a
,
...
...
src/infiniop/ops/random_sample/operator.cc
View file @
3c31dc6c
...
@@ -3,36 +3,36 @@
...
@@ -3,36 +3,36 @@
__C
infiniopStatus_t
infiniopCreateRandomSampleDescriptor
(
infiniopHandle_t
handle
,
infiniopRandomSampleDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
result
,
infiniopTensorDescriptor_t
probs
)
{
__C
infiniopStatus_t
infiniopCreateRandomSampleDescriptor
(
infiniopHandle_t
handle
,
infiniopRandomSampleDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
result
,
infiniopTensorDescriptor_t
probs
)
{
switch
(
handle
->
device
)
{
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuCreateRandomSampleDescriptor
(
handle
,
(
RandomSampleCpuDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
return
cpuCreateRandomSampleDescriptor
(
handle
,
(
RandomSampleCpuDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
case
DevNvGpu
:
return
cudaCreateRandomSampleDescriptor
((
CudaHandle_t
)
handle
,
(
RandomSampleCudaDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
return
cudaCreateRandomSampleDescriptor
((
CudaHandle_t
)
handle
,
(
RandomSampleCudaDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangCreateRandomSampleDescriptor
((
BangHandle_t
)
handle
,
return
bangCreateRandomSampleDescriptor
((
BangHandle_t
)
handle
,
(
RandomSampleBangDescriptor_t
*
)
desc_ptr
,
result
,
(
RandomSampleBangDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
probs
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
ascendCreateRandomSampleDescriptor
((
AscendHandle_t
)
handle
,
return
ascendCreateRandomSampleDescriptor
((
AscendHandle_t
)
handle
,
(
RandomSampleAscendDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
(
RandomSampleAscendDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaCreateRandomSampleDescriptor
((
MacaHandle_t
)
handle
,
return
macaCreateRandomSampleDescriptor
((
MacaHandle_t
)
handle
,
(
RandomSampleMacaDescriptor_t
*
)
desc_ptr
,
result
,
(
RandomSampleMacaDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
probs
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
case
DevMthreadsGpu
:
return
musaCreateRandomSampleDescriptor
((
MusaHandle_t
)
handle
,
(
RandomSampleMusaDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
return
musaCreateRandomSampleDescriptor
((
MusaHandle_t
)
handle
,
(
RandomSampleMusaDescriptor_t
*
)
desc_ptr
,
result
,
probs
);
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -41,35 +41,35 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl
...
@@ -41,35 +41,35 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl
__C
infiniopStatus_t
infiniopGetRandomSampleWorkspaceSize
(
infiniopRandomSampleDescriptor_t
desc
,
uint64_t
*
size
)
{
__C
infiniopStatus_t
infiniopGetRandomSampleWorkspaceSize
(
infiniopRandomSampleDescriptor_t
desc
,
uint64_t
*
size
)
{
switch
(
desc
->
device
)
{
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuGetRandomSampleWorkspaceSize
((
RandomSampleCpuDescriptor_t
)
desc
,
size
);
return
cpuGetRandomSampleWorkspaceSize
((
RandomSampleCpuDescriptor_t
)
desc
,
size
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
case
DevNvGpu
:
{
return
cudaGetRandomSampleWorkspaceSize
((
RandomSampleCudaDescriptor_t
)
desc
,
size
);
return
cudaGetRandomSampleWorkspaceSize
((
RandomSampleCudaDescriptor_t
)
desc
,
size
);
}
}
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangGetRandomSampleWorkspaceSize
((
RandomSampleBangDescriptor_t
)
desc
,
size
);
return
bangGetRandomSampleWorkspaceSize
((
RandomSampleBangDescriptor_t
)
desc
,
size
);
// return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
// return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
ascendGetRandomSampleWorkspaceSize
((
RandomSampleAscendDescriptor_t
)
desc
,
size
);
return
ascendGetRandomSampleWorkspaceSize
((
RandomSampleAscendDescriptor_t
)
desc
,
size
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaGetRandomSampleWorkspaceSize
((
RandomSampleMacaDescriptor_t
)
desc
,
size
);
return
macaGetRandomSampleWorkspaceSize
((
RandomSampleMacaDescriptor_t
)
desc
,
size
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaGetRandomSampleWorkspaceSize
((
RandomSampleMusaDescriptor_t
)
desc
,
size
);
return
musaGetRandomSampleWorkspaceSize
((
RandomSampleMusaDescriptor_t
)
desc
,
size
);
}
}
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -87,31 +87,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
...
@@ -87,31 +87,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
void
*
stream
)
{
void
*
stream
)
{
switch
(
desc
->
device
)
{
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuRandomSample
((
RandomSampleCpuDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
cpuRandomSample
((
RandomSampleCpuDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
case
DevNvGpu
:
return
cudaRandomSample
((
RandomSampleCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
cudaRandomSample
((
RandomSampleCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangRandomSample
((
RandomSampleBangDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
bangRandomSample
((
RandomSampleBangDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
ascendRandomSample
((
RandomSampleAscendDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
ascendRandomSample
((
RandomSampleAscendDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaRandomSample
((
RandomSampleMacaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
macaRandomSample
((
RandomSampleMacaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
case
DevMthreadsGpu
:
return
musaRandomSample
((
RandomSampleMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
musaRandomSample
((
RandomSampleMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -120,31 +120,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
...
@@ -120,31 +120,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
__C
infiniopStatus_t
infiniopDestroyRandomSampleDescriptor
(
infiniopRandomSampleDescriptor_t
desc
)
{
__C
infiniopStatus_t
infiniopDestroyRandomSampleDescriptor
(
infiniopRandomSampleDescriptor_t
desc
)
{
switch
(
desc
->
device
)
{
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuDestroyRandomSampleDescriptor
((
RandomSampleCpuDescriptor_t
)
desc
);
return
cpuDestroyRandomSampleDescriptor
((
RandomSampleCpuDescriptor_t
)
desc
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
case
DevNvGpu
:
return
cudaDestroyRandomSampleDescriptor
((
RandomSampleCudaDescriptor_t
)
desc
);
return
cudaDestroyRandomSampleDescriptor
((
RandomSampleCudaDescriptor_t
)
desc
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangDestroyRandomSampleDescriptor
((
RandomSampleBangDescriptor_t
)
desc
);
return
bangDestroyRandomSampleDescriptor
((
RandomSampleBangDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
ascendDestroyRandomSampleDescriptor
((
RandomSampleAscendDescriptor_t
)
desc
);
return
ascendDestroyRandomSampleDescriptor
((
RandomSampleAscendDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaDestroyRandomSampleDescriptor
((
RandomSampleMacaDescriptor_t
)
desc
);
return
macaDestroyRandomSampleDescriptor
((
RandomSampleMacaDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
case
DevMthreadsGpu
:
return
musaDestroyRandomSampleDescriptor
((
RandomSampleMusaDescriptor_t
)
desc
);
return
musaDestroyRandomSampleDescriptor
((
RandomSampleMusaDescriptor_t
)
desc
);
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/rearrange/operator.cc
View file @
3c31dc6c
...
@@ -7,37 +7,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
...
@@ -7,37 +7,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
infiniopTensorDescriptor_t
src
)
{
infiniopTensorDescriptor_t
src
)
{
switch
(
handle
->
device
)
{
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuCreateRearrangeDescriptor
(
handle
,
(
RearrangeCpuDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
return
cpuCreateRearrangeDescriptor
(
handle
,
(
RearrangeCpuDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
case
DevNvGpu
:
{
return
cudaCreateRearrangeDescriptor
((
CudaHandle_t
)
handle
,
(
RearrangeCudaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
return
cudaCreateRearrangeDescriptor
((
CudaHandle_t
)
handle
,
(
RearrangeCudaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
}
}
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangCreateRearrangeDescriptor
((
BangHandle_t
)
handle
,
(
RearrangeBangDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
return
bangCreateRearrangeDescriptor
((
BangHandle_t
)
handle
,
(
RearrangeBangDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
aclnnCreateRearrangeDescriptor
((
AscendHandle_t
)
handle
,
return
aclnnCreateRearrangeDescriptor
((
AscendHandle_t
)
handle
,
(
RearrangeAclnnDescriptor_t
*
)
desc_ptr
,
(
RearrangeAclnnDescriptor_t
*
)
desc_ptr
,
dst
,
dst
,
src
);
src
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaCreateRearrangeDescriptor
((
MacaHandle_t
)
handle
,
(
RearrangeMacaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
return
macaCreateRearrangeDescriptor
((
MacaHandle_t
)
handle
,
(
RearrangeMacaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaCreateRearrangeDescriptor
((
MusaHandle_t
)
handle
,
(
RearrangeMusaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
return
musaCreateRearrangeDescriptor
((
MusaHandle_t
)
handle
,
(
RearrangeMusaDescriptor_t
*
)
desc_ptr
,
dst
,
src
);
}
}
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -46,37 +46,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
...
@@ -46,37 +46,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
__C
infiniopStatus_t
infiniopRearrange
(
infiniopRearrangeDescriptor_t
desc
,
void
*
dst
,
void
const
*
src
,
void
*
stream
)
{
__C
infiniopStatus_t
infiniopRearrange
(
infiniopRearrangeDescriptor_t
desc
,
void
*
dst
,
void
const
*
src
,
void
*
stream
)
{
switch
(
desc
->
device
)
{
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuRearrange
((
RearrangeCpuDescriptor_t
)
desc
,
dst
,
src
,
stream
);
return
cpuRearrange
((
RearrangeCpuDescriptor_t
)
desc
,
dst
,
src
,
stream
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
case
DevNvGpu
:
{
return
cudaRearrange
((
RearrangeCudaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
return
cudaRearrange
((
RearrangeCudaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
}
}
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangRearrange
((
RearrangeBangDescriptor_t
)
desc
,
dst
,
src
,
stream
);
return
bangRearrange
((
RearrangeBangDescriptor_t
)
desc
,
dst
,
src
,
stream
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
aclnnRearrange
((
RearrangeAclnnDescriptor_t
)
desc
,
return
aclnnRearrange
((
RearrangeAclnnDescriptor_t
)
desc
,
dst
,
dst
,
src
,
src
,
stream
);
stream
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaRearrange
((
RearrangeMacaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
return
macaRearrange
((
RearrangeMacaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaRearrange
((
RearrangeMusaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
return
musaRearrange
((
RearrangeMusaDescriptor_t
)
desc
,
dst
,
src
,
stream
);
}
}
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -85,34 +85,34 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void
...
@@ -85,34 +85,34 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void
__C
infiniopStatus_t
infiniopDestroyRearrangeDescriptor
(
infiniopRearrangeDescriptor_t
desc
)
{
__C
infiniopStatus_t
infiniopDestroyRearrangeDescriptor
(
infiniopRearrangeDescriptor_t
desc
)
{
switch
(
desc
->
device
)
{
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuDestroyRearrangeDescriptor
((
RearrangeCpuDescriptor_t
)
desc
);
return
cpuDestroyRearrangeDescriptor
((
RearrangeCpuDescriptor_t
)
desc
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
case
DevNvGpu
:
{
return
cudaDestroyRearrangeDescriptor
((
RearrangeCudaDescriptor_t
)
desc
);
return
cudaDestroyRearrangeDescriptor
((
RearrangeCudaDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangDestroyRearrangeDescriptor
((
RearrangeBangDescriptor_t
)
desc
);
return
bangDestroyRearrangeDescriptor
((
RearrangeBangDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
aclnnDestroyRearrangeDescriptor
((
RearrangeAclnnDescriptor_t
)
desc
);
return
aclnnDestroyRearrangeDescriptor
((
RearrangeAclnnDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaDestroyRearrangeDescriptor
((
RearrangeMacaDescriptor_t
)
desc
);
return
macaDestroyRearrangeDescriptor
((
RearrangeMacaDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaDestroyRearrangeDescriptor
((
RearrangeMusaDescriptor_t
)
desc
);
return
musaDestroyRearrangeDescriptor
((
RearrangeMusaDescriptor_t
)
desc
);
}
}
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/rms_norm/operator.cc
View file @
3c31dc6c
...
@@ -9,38 +9,38 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
...
@@ -9,38 +9,38 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
float
epsilon
)
{
float
epsilon
)
{
switch
(
handle
->
device
)
{
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuCreateRMSNormDescriptor
(
handle
,
(
RMSNormCpuDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
return
cpuCreateRMSNormDescriptor
(
handle
,
(
RMSNormCpuDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
case
DevNvGpu
:
{
return
cudaCreateRMSNormDescriptor
((
CudaHandle_t
)
handle
,
(
RMSNormCudaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
return
cudaCreateRMSNormDescriptor
((
CudaHandle_t
)
handle
,
(
RMSNormCudaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
}
}
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangCreateRMSNormDescriptor
((
BangHandle_t
)
handle
,
(
RMSNormBangDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
return
bangCreateRMSNormDescriptor
((
BangHandle_t
)
handle
,
(
RMSNormBangDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
aclnnCreateRMSNormDescriptor
((
AscendHandle_t
)
handle
,
return
aclnnCreateRMSNormDescriptor
((
AscendHandle_t
)
handle
,
(
RMSNormAclnnDescriptor_t
*
)
desc_ptr
,
(
RMSNormAclnnDescriptor_t
*
)
desc_ptr
,
y_desc
,
y_desc
,
x_desc
,
x_desc
,
w_desc
,
w_desc
,
epsilon
);
epsilon
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaCreateRMSNormDescriptor
((
MacaHandle_t
)
handle
,
(
RMSNormMacaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
return
macaCreateRMSNormDescriptor
((
MacaHandle_t
)
handle
,
(
RMSNormMacaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaCreateRMSNormDescriptor
((
MusaHandle_t
)
handle
,
(
RMSNormMusaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
return
musaCreateRMSNormDescriptor
((
MusaHandle_t
)
handle
,
(
RMSNormMusaDescriptor_t
*
)
desc_ptr
,
y_desc
,
x_desc
,
w_desc
,
epsilon
);
}
}
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -49,35 +49,35 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
...
@@ -49,35 +49,35 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
__C
infiniopStatus_t
infiniopGetRMSNormWorkspaceSize
(
infiniopRMSNormDescriptor_t
desc
,
uint64_t
*
size
)
{
__C
infiniopStatus_t
infiniopGetRMSNormWorkspaceSize
(
infiniopRMSNormDescriptor_t
desc
,
uint64_t
*
size
)
{
switch
(
desc
->
device
)
{
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuGetRMSNormWorkspaceSize
((
RMSNormCpuDescriptor_t
)
desc
,
size
);
return
cpuGetRMSNormWorkspaceSize
((
RMSNormCpuDescriptor_t
)
desc
,
size
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
case
DevNvGpu
:
{
return
cudaGetRMSNormWorkspaceSize
((
RMSNormCudaDescriptor_t
)
desc
,
size
);
return
cudaGetRMSNormWorkspaceSize
((
RMSNormCudaDescriptor_t
)
desc
,
size
);
}
}
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangGetRMSNormWorkspaceSize
((
RMSNormBangDescriptor_t
)
desc
,
size
);
return
bangGetRMSNormWorkspaceSize
((
RMSNormBangDescriptor_t
)
desc
,
size
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
aclnnGetRMSNormWorkspaceSize
((
RMSNormAclnnDescriptor_t
)
desc
,
return
aclnnGetRMSNormWorkspaceSize
((
RMSNormAclnnDescriptor_t
)
desc
,
size
);
size
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaGetRMSNormWorkspaceSize
((
RMSNormMacaDescriptor_t
)
desc
,
size
);
return
macaGetRMSNormWorkspaceSize
((
RMSNormMacaDescriptor_t
)
desc
,
size
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaGetRMSNormWorkspaceSize
((
RMSNormMusaDescriptor_t
)
desc
,
size
);
return
musaGetRMSNormWorkspaceSize
((
RMSNormMusaDescriptor_t
)
desc
,
size
);
}
}
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -87,40 +87,40 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
...
@@ -87,40 +87,40 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
void
*
y
,
void
const
*
x
,
void
const
*
w
,
void
*
stream
)
{
void
*
y
,
void
const
*
x
,
void
const
*
w
,
void
*
stream
)
{
switch
(
desc
->
device
)
{
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuRMSNorm
((
RMSNormCpuDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
return
cpuRMSNorm
((
RMSNormCpuDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
case
DevNvGpu
:
{
return
cudaRMSNorm
((
RMSNormCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
return
cudaRMSNorm
((
RMSNormCudaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
}
}
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangRMSNorm
((
RMSNormBangDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
return
bangRMSNorm
((
RMSNormBangDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
aclnnRMSNorm
((
RMSNormAclnnDescriptor_t
)
desc
,
return
aclnnRMSNorm
((
RMSNormAclnnDescriptor_t
)
desc
,
workspace
,
workspace
,
workspace_size
,
workspace_size
,
y
,
y
,
x
,
x
,
w
,
w
,
stream
);
stream
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaRMSNorm
((
RMSNormMacaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
return
macaRMSNorm
((
RMSNormMacaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaRMSNorm
((
RMSNormMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
return
musaRMSNorm
((
RMSNormMusaDescriptor_t
)
desc
,
workspace
,
workspace_size
,
y
,
x
,
w
,
stream
);
}
}
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -129,34 +129,34 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
...
@@ -129,34 +129,34 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
__C
infiniopStatus_t
infiniopDestroyRMSNormDescriptor
(
infiniopRMSNormDescriptor_t
desc
)
{
__C
infiniopStatus_t
infiniopDestroyRMSNormDescriptor
(
infiniopRMSNormDescriptor_t
desc
)
{
switch
(
desc
->
device
)
{
switch
(
desc
->
device
)
{
#ifdef ENABLE_CPU
#ifdef ENABLE_CPU
case
DevCpu
:
case
DevCpu
:
return
cpuDestroyRMSNormDescriptor
((
RMSNormCpuDescriptor_t
)
desc
);
return
cpuDestroyRMSNormDescriptor
((
RMSNormCpuDescriptor_t
)
desc
);
#endif
#endif
#ifdef ENABLE_NV_GPU
#ifdef ENABLE_NV_GPU
case
DevNvGpu
:
{
case
DevNvGpu
:
{
return
cudaDestroyRMSNormDescriptor
((
RMSNormCudaDescriptor_t
)
desc
);
return
cudaDestroyRMSNormDescriptor
((
RMSNormCudaDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangDestroyRMSNormDescriptor
((
RMSNormBangDescriptor_t
)
desc
);
return
bangDestroyRMSNormDescriptor
((
RMSNormBangDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
case
DevAscendNpu
:
{
return
aclnnDestroyRMSNormDescriptor
((
RMSNormAclnnDescriptor_t
)
desc
);
return
aclnnDestroyRMSNormDescriptor
((
RMSNormAclnnDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
return
macaDestroyRMSNormDescriptor
((
RMSNormMacaDescriptor_t
)
desc
);
return
macaDestroyRMSNormDescriptor
((
RMSNormMacaDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
return
musaDestroyRMSNormDescriptor
((
RMSNormMusaDescriptor_t
)
desc
);
return
musaDestroyRMSNormDescriptor
((
RMSNormMusaDescriptor_t
)
desc
);
}
}
#endif
#endif
}
}
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/utils.h
View file @
3c31dc6c
...
@@ -13,28 +13,28 @@
...
@@ -13,28 +13,28 @@
#define ROUND_UP_DIV(x, y) ((x + y - 1) / y)
#define ROUND_UP_DIV(x, y) ((x + y - 1) / y)
#define CHECK_ERROR(call, target, errCode)
\
#define CHECK_ERROR(call, target, errCode) \
do {
\
do { \
if (auto value = (call); value == (target)) {
\
if (auto value = (call); value == (target)) { \
std::cerr << "Error: expected " << (target) << " but got "
\
std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function "
\
<< value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl;
\
<< __func__ << ", line " << __LINE__ << std::endl; \
return (errCode);
\
return (errCode); \
}
\
} \
} while (0)
} while (0)
#define CREATE_CHECK_ERROR(expr, value, target, errCode)
\
#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
expr;
\
expr; \
CHECK_ERROR(value, target, errCode)
CHECK_ERROR(value, target, errCode)
#define CHECK_STATUS(call, target)
\
#define CHECK_STATUS(call, target) \
do {
\
do { \
if (auto value = (call); value != (target)) {
\
if (auto value = (call); value != (target)) { \
std::cerr << "Error: expected " << (target) << " but got "
\
std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function "
\
<< value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl;
\
<< __func__ << ", line " << __LINE__ << std::endl; \
return value;
\
return value; \
}
\
} \
} while (0)
} while (0)
inline
std
::
vector
<
int64_t
>
getByteStrides
(
infiniopTensorDescriptor_t
desc
)
{
inline
std
::
vector
<
int64_t
>
getByteStrides
(
infiniopTensorDescriptor_t
desc
)
{
...
@@ -67,8 +67,7 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
...
@@ -67,8 +67,7 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
// compute broadcasted shape
// compute broadcasted shape
for
(
size_t
i
=
0
;
i
<
max_rank
;
++
i
)
{
for
(
size_t
i
=
0
;
i
<
max_rank
;
++
i
)
{
if
(
padded_shape1
[
i
]
==
padded_shape2
[
i
]
||
padded_shape1
[
i
]
==
1
||
if
(
padded_shape1
[
i
]
==
padded_shape2
[
i
]
||
padded_shape1
[
i
]
==
1
||
padded_shape2
[
i
]
==
1
)
{
padded_shape2
[
i
]
==
1
)
{
broadcast_shape
[
i
]
=
std
::
max
(
padded_shape1
[
i
],
padded_shape2
[
i
]);
broadcast_shape
[
i
]
=
std
::
max
(
padded_shape1
[
i
],
padded_shape2
[
i
]);
}
else
{
}
else
{
return
false
;
return
false
;
...
@@ -89,10 +88,7 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
...
@@ -89,10 +88,7 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
auto
broadcast_shape
=
broadcast_shape_
.
data
(),
auto
broadcast_shape
=
broadcast_shape_
.
data
(),
padded_shape1
=
padded_shape1_
.
data
(),
padded_shape1
=
padded_shape1_
.
data
(),
padded_shape2
=
padded_shape2_
.
data
();
padded_shape2
=
padded_shape2_
.
data
();
if
(
broadcast_ndim
!=
c
->
ndim
||
if
(
broadcast_ndim
!=
c
->
ndim
||
!
getBroadcastShape
(
a
->
shape
,
a
->
ndim
,
b
->
shape
,
b
->
ndim
,
broadcast_shape
,
padded_shape1
,
padded_shape2
,
broadcast_ndim
))
{
!
getBroadcastShape
(
a
->
shape
,
a
->
ndim
,
b
->
shape
,
b
->
ndim
,
broadcast_shape
,
padded_shape1
,
padded_shape2
,
broadcast_ndim
))
{
return
false
;
return
false
;
}
}
return
std
::
equal
(
broadcast_shape
,
broadcast_shape
+
broadcast_ndim
,
return
std
::
equal
(
broadcast_shape
,
broadcast_shape
+
broadcast_ndim
,
...
@@ -126,7 +122,6 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
...
@@ -126,7 +122,6 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
return
isValidBroadcastShape
(
a
,
b
,
c
,
std
::
max
(
a
->
ndim
,
b
->
ndim
));
return
isValidBroadcastShape
(
a
,
b
,
c
,
std
::
max
(
a
->
ndim
,
b
->
ndim
));
}
}
// permute the dimensions of a tensor descriptor
// permute the dimensions of a tensor descriptor
inline
infiniopTensorDescriptor_t
permute
(
infiniopTensorDescriptor_t
desc
,
inline
infiniopTensorDescriptor_t
permute
(
infiniopTensorDescriptor_t
desc
,
const
std
::
vector
<
size_t
>
&
order
)
{
const
std
::
vector
<
size_t
>
&
order
)
{
...
@@ -149,10 +144,9 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
...
@@ -149,10 +144,9 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are
// contiguous
// contiguous
inline
bool
isContiguous
(
const
infiniopTensorDescriptor_t
&
desc
,
inline
bool
isContiguous
(
const
infiniopTensorDescriptor_t
&
desc
,
size_t
dim_start
,
size_t
dim_end
)
{
size_t
dim_start
,
size_t
dim_end
)
{
for
(
size_t
i
=
dim_start
+
1
;
i
<=
dim_end
;
i
++
)
{
for
(
size_t
i
=
dim_start
+
1
;
i
<=
dim_end
;
i
++
)
{
if
(
desc
->
strides
[
i
-
1
]
!=
if
(
desc
->
strides
[
i
-
1
]
!=
static_cast
<
int64_t
>
(
desc
->
shape
[
i
])
*
desc
->
strides
[
i
])
{
static_cast
<
int64_t
>
(
desc
->
shape
[
i
])
*
desc
->
strides
[
i
])
{
return
false
;
return
false
;
}
}
}
}
...
@@ -168,7 +162,7 @@ inline bool isContiguous(const infiniopTensorDescriptor_t &desc) {
...
@@ -168,7 +162,7 @@ inline bool isContiguous(const infiniopTensorDescriptor_t &desc) {
// merge the dimensions [dim_start, dim_end] of a tensor descriptor
// merge the dimensions [dim_start, dim_end] of a tensor descriptor
inline
infiniopTensorDescriptor_t
dimMerge
(
infiniopTensorDescriptor_t
desc
,
inline
infiniopTensorDescriptor_t
dimMerge
(
infiniopTensorDescriptor_t
desc
,
size_t
dim_start
,
size_t
dim_end
)
{
size_t
dim_start
,
size_t
dim_end
)
{
size_t
ndim
=
desc
->
ndim
;
size_t
ndim
=
desc
->
ndim
;
if
(
dim_start
>
dim_end
||
dim_end
>=
ndim
)
{
if
(
dim_start
>
dim_end
||
dim_end
>=
ndim
)
{
return
nullptr
;
return
nullptr
;
...
@@ -203,11 +197,10 @@ inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,
...
@@ -203,11 +197,10 @@ inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,
// split the dimension dim of a tensor descriptor into multiple dimensions
// split the dimension dim of a tensor descriptor into multiple dimensions
inline
infiniopTensorDescriptor_t
dimSplit
(
infiniopTensorDescriptor_t
desc
,
inline
infiniopTensorDescriptor_t
dimSplit
(
infiniopTensorDescriptor_t
desc
,
size_t
dim
,
size_t
dim
,
const
std
::
vector
<
size_t
>
&
dims
)
{
const
std
::
vector
<
size_t
>
&
dims
)
{
size_t
ndim
=
desc
->
ndim
;
size_t
ndim
=
desc
->
ndim
;
if
(
desc
->
shape
[
dim
]
!=
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
(
size_t
)
1
,
if
(
desc
->
shape
[
dim
]
!=
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
(
size_t
)
1
,
std
::
multiplies
{}))
{
std
::
multiplies
{}))
{
return
nullptr
;
return
nullptr
;
}
}
size_t
new_ndim
=
ndim
+
dims
.
size
()
-
1
;
size_t
new_ndim
=
ndim
+
dims
.
size
()
-
1
;
...
@@ -221,10 +214,7 @@ inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
...
@@ -221,10 +214,7 @@ inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
}
}
for
(
size_t
i
=
0
;
i
<
dims
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
dims
.
size
();
i
++
)
{
new_shape
[
index
]
=
dims
[
i
];
new_shape
[
index
]
=
dims
[
i
];
new_strides
[
index
]
=
new_strides
[
index
]
=
desc
->
strides
[
dim
]
*
desc
->
shape
[
dim
]
/
std
::
accumulate
(
dims
.
begin
(),
dims
.
begin
()
+
i
+
1
,
(
size_t
)
1
,
std
::
multiplies
<
size_t
>
());
desc
->
strides
[
dim
]
*
desc
->
shape
[
dim
]
/
std
::
accumulate
(
dims
.
begin
(),
dims
.
begin
()
+
i
+
1
,
(
size_t
)
1
,
std
::
multiplies
<
size_t
>
());
index
++
;
index
++
;
}
}
for
(
size_t
i
=
dim
+
1
;
i
<
ndim
;
i
++
)
{
for
(
size_t
i
=
dim
+
1
;
i
<
ndim
;
i
++
)
{
...
...
test/infiniop/__init__.py
View file @
3c31dc6c
import
libinfiniop
import
libinfiniop
\ No newline at end of file
test/infiniop/add.py
View file @
3c31dc6c
...
@@ -41,8 +41,8 @@ def test(
...
@@ -41,8 +41,8 @@ def test(
lib
,
lib
,
handle
,
handle
,
torch_device
,
torch_device
,
c_shape
,
c_shape
,
a_shape
,
a_shape
,
b_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
tensor_dtype
=
torch
.
float16
,
inplace
=
Inplace
.
OUT_OF_PLACE
,
inplace
=
Inplace
.
OUT_OF_PLACE
,
...
@@ -56,13 +56,21 @@ def test(
...
@@ -56,13 +56,21 @@ def test(
a
=
torch
.
rand
(
a_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
a
=
torch
.
rand
(
a_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
b
=
torch
.
rand
(
b_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
b
=
torch
.
rand
(
b_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
c
=
torch
.
rand
(
c_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
if
inplace
==
Inplace
.
OUT_OF_PLACE
else
(
a
if
inplace
==
Inplace
.
INPLACE_A
else
b
)
c
=
(
torch
.
rand
(
c_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
if
inplace
==
Inplace
.
OUT_OF_PLACE
else
(
a
if
inplace
==
Inplace
.
INPLACE_A
else
b
)
)
ans
=
add
(
a
,
b
)
ans
=
add
(
a
,
b
)
a_tensor
=
to_tensor
(
a
,
lib
)
a_tensor
=
to_tensor
(
a
,
lib
)
b_tensor
=
to_tensor
(
b
,
lib
)
b_tensor
=
to_tensor
(
b
,
lib
)
c_tensor
=
to_tensor
(
c
,
lib
)
if
inplace
==
Inplace
.
OUT_OF_PLACE
else
(
a_tensor
if
inplace
==
Inplace
.
INPLACE_A
else
b_tensor
)
c_tensor
=
(
to_tensor
(
c
,
lib
)
if
inplace
==
Inplace
.
OUT_OF_PLACE
else
(
a_tensor
if
inplace
==
Inplace
.
INPLACE_A
else
b_tensor
)
)
descriptor
=
infiniopAddDescriptor_t
()
descriptor
=
infiniopAddDescriptor_t
()
check_error
(
check_error
(
...
@@ -91,8 +99,10 @@ def test_cpu(lib, test_cases):
...
@@ -91,8 +99,10 @@ def test_cpu(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CPU
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"cpu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"cpu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"cpu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -100,8 +110,10 @@ def test_cuda(lib, test_cases):
...
@@ -100,8 +110,10 @@ def test_cuda(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CUDA
device
=
DeviceEnum
.
DEVICE_CUDA
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"cuda"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"cuda"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"cuda"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -111,13 +123,16 @@ def test_bang(lib, test_cases):
...
@@ -111,13 +123,16 @@ def test_bang(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_BANG
device
=
DeviceEnum
.
DEVICE_BANG
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"mlu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"mlu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"mlu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
test_cases
=
[
test_cases
=
[
# fmt: off
# c_shape, a_shape, b_shape, inplace
# c_shape, a_shape, b_shape, inplace
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
...
@@ -133,6 +148,7 @@ if __name__ == "__main__":
...
@@ -133,6 +148,7 @@ if __name__ == "__main__":
((
2
,
4
,
3
),
(
2
,
1
,
3
),
(
4
,
3
),
Inplace
.
OUT_OF_PLACE
),
((
2
,
4
,
3
),
(
2
,
1
,
3
),
(
4
,
3
),
Inplace
.
OUT_OF_PLACE
),
((
2
,
3
,
4
,
5
),
(
2
,
3
,
4
,
5
),
(
5
,),
Inplace
.
OUT_OF_PLACE
),
((
2
,
3
,
4
,
5
),
(
2
,
3
,
4
,
5
),
(
5
,),
Inplace
.
OUT_OF_PLACE
),
((
3
,
2
,
4
,
5
),
(
4
,
5
),
(
3
,
2
,
1
,
1
),
Inplace
.
OUT_OF_PLACE
),
((
3
,
2
,
4
,
5
),
(
4
,
5
),
(
3
,
2
,
1
,
1
),
Inplace
.
OUT_OF_PLACE
),
# fmt: on
]
]
args
=
get_args
()
args
=
get_args
()
lib
=
open_lib
()
lib
=
open_lib
()
...
...
test/infiniop/avg_pool.py
View file @
3c31dc6c
...
@@ -35,7 +35,7 @@ class AvgPoolDescriptor(Structure):
...
@@ -35,7 +35,7 @@ class AvgPoolDescriptor(Structure):
infiniopAvgPoolDescriptor_t
=
POINTER
(
AvgPoolDescriptor
)
infiniopAvgPoolDescriptor_t
=
POINTER
(
AvgPoolDescriptor
)
def
pool
(
x
,
k
,
padding
,
stride
,
dilation
=
1
):
def
pool
(
x
,
k
,
padding
,
stride
,
dilation
=
1
):
pooling_layers
=
{
pooling_layers
=
{
1
:
torch
.
nn
.
AvgPool1d
,
1
:
torch
.
nn
.
AvgPool1d
,
2
:
torch
.
nn
.
AvgPool2d
,
2
:
torch
.
nn
.
AvgPool2d
,
...
@@ -48,7 +48,9 @@ def pool(x, k, padding, stride, dilation = 1):
...
@@ -48,7 +48,9 @@ def pool(x, k, padding, stride, dilation = 1):
return
None
return
None
if
ndim
==
3
and
x
.
dtype
==
torch
.
float16
:
if
ndim
==
3
and
x
.
dtype
==
torch
.
float16
:
ans
=
pooling_layers
[
ndim
](
k
,
stride
=
stride
,
padding
=
padding
)(
x
.
to
(
torch
.
float32
)).
to
(
torch
.
float16
)
ans
=
pooling_layers
[
ndim
](
k
,
stride
=
stride
,
padding
=
padding
)(
x
.
to
(
torch
.
float32
)
).
to
(
torch
.
float16
)
else
:
else
:
ans
=
pooling_layers
[
ndim
](
k
,
stride
=
stride
,
padding
=
padding
)(
x
)
ans
=
pooling_layers
[
ndim
](
k
,
stride
=
stride
,
padding
=
padding
)(
x
)
if
PROFILE
:
if
PROFILE
:
...
@@ -69,18 +71,20 @@ def inferShape(x_shape, kernel_shape, padding, strides):
...
@@ -69,18 +71,20 @@ def inferShape(x_shape, kernel_shape, padding, strides):
return
x_shape
[:
2
]
+
tuple
(
output_shape
)
return
x_shape
[:
2
]
+
tuple
(
output_shape
)
# convert a python tuple to a ctype void pointer
# convert a python tuple to a ctype void pointer
def
tuple_to_void_p
(
py_tuple
:
Tuple
):
def
tuple_to_void_p
(
py_tuple
:
Tuple
):
array
=
ctypes
.
c_int64
*
len
(
py_tuple
)
array
=
ctypes
.
c_int64
*
len
(
py_tuple
)
data_array
=
array
(
*
py_tuple
)
data_array
=
array
(
*
py_tuple
)
return
ctypes
.
cast
(
data_array
,
ctypes
.
c_void_p
)
return
ctypes
.
cast
(
data_array
,
ctypes
.
c_void_p
)
def
test
(
def
test
(
lib
,
lib
,
handle
,
handle
,
torch_device
,
torch_device
,
x_shape
,
x_shape
,
k_shape
,
k_shape
,
padding
,
padding
,
strides
,
strides
,
tensor_dtype
=
torch
.
float16
,
tensor_dtype
=
torch
.
float16
,
...
@@ -90,7 +94,9 @@ def test(
...
@@ -90,7 +94,9 @@ def test(
)
)
x
=
torch
.
rand
(
x_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
x
=
torch
.
rand
(
x_shape
,
dtype
=
tensor_dtype
).
to
(
torch_device
)
y
=
torch
.
rand
(
inferShape
(
x_shape
,
k_shape
,
padding
,
strides
),
dtype
=
tensor_dtype
).
to
(
torch_device
)
y
=
torch
.
rand
(
inferShape
(
x_shape
,
k_shape
,
padding
,
strides
),
dtype
=
tensor_dtype
).
to
(
torch_device
)
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
ans
=
pool
(
x
,
k_shape
,
padding
,
strides
)
ans
=
pool
(
x
,
k_shape
,
padding
,
strides
)
...
@@ -126,7 +132,9 @@ def test(
...
@@ -126,7 +132,9 @@ def test(
check_error
(
check_error
(
lib
.
infiniopGetAvgPoolWorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspaceSize
))
lib
.
infiniopGetAvgPoolWorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspaceSize
))
)
)
workspace
=
torch
.
zeros
(
int
(
workspaceSize
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
)
workspace
=
torch
.
zeros
(
int
(
workspaceSize
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
)
workspace_ptr
=
ctypes
.
cast
(
workspace
.
data_ptr
(),
ctypes
.
POINTER
(
ctypes
.
c_uint8
))
workspace_ptr
=
ctypes
.
cast
(
workspace
.
data_ptr
(),
ctypes
.
POINTER
(
ctypes
.
c_uint8
))
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
...
@@ -164,8 +172,10 @@ def test_cpu(lib, test_cases):
...
@@ -164,8 +172,10 @@ def test_cpu(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CPU
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
kernel_shape
,
padding
,
strides
in
test_cases
:
for
x_shape
,
kernel_shape
,
padding
,
strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -173,8 +183,10 @@ def test_cuda(lib, test_cases):
...
@@ -173,8 +183,10 @@ def test_cuda(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CUDA
device
=
DeviceEnum
.
DEVICE_CUDA
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
kernel_shape
,
padding
,
strides
in
test_cases
:
for
x_shape
,
kernel_shape
,
padding
,
strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -184,17 +196,21 @@ def test_bang(lib, test_cases):
...
@@ -184,17 +196,21 @@ def test_bang(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_BANG
device
=
DeviceEnum
.
DEVICE_BANG
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
kernel_shape
,
padding
,
strides
in
test_cases
:
for
x_shape
,
kernel_shape
,
padding
,
strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
kernel_shape
,
padding
,
strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
test_cases
=
[
test_cases
=
[
# fmt: off
# x_shape, kernel_shape, padding, strides
# x_shape, kernel_shape, padding, strides
((
1
,
1
,
10
),
(
3
,),
(
1
,),
(
1
,)),
((
1
,
1
,
10
),
(
3
,),
(
1
,),
(
1
,)),
((
32
,
3
,
224
,
224
),
(
3
,
3
),
(
1
,
1
),
(
2
,
2
)),
((
32
,
3
,
224
,
224
),
(
3
,
3
),
(
1
,
1
),
(
2
,
2
)),
((
1
,
1
,
16
,
16
,
16
),
(
5
,
5
,
5
),
(
2
,
2
,
2
),
(
2
,
2
,
2
)),
((
1
,
1
,
16
,
16
,
16
),
(
5
,
5
,
5
),
(
2
,
2
,
2
),
(
2
,
2
,
2
)),
# fmt: on
]
]
args
=
get_args
()
args
=
get_args
()
lib
=
open_lib
()
lib
=
open_lib
()
...
...
test/infiniop/causal_softmax.py
View file @
3c31dc6c
...
@@ -101,6 +101,7 @@ def test_bang(lib, test_cases):
...
@@ -101,6 +101,7 @@ def test_bang(lib, test_cases):
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
x_stride
)
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
x_stride
)
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
def
test_ascend
(
lib
,
test_cases
):
def
test_ascend
(
lib
,
test_cases
):
import
torch_npu
import
torch_npu
...
@@ -111,11 +112,12 @@ def test_ascend(lib, test_cases):
...
@@ -111,11 +112,12 @@ def test_ascend(lib, test_cases):
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
test_cases
=
[
test_cases
=
[
# x_shape, x_stride
# x_shape, x_stride
((
32
,
20
,
512
),
None
),
((
32
,
20
,
512
),
None
),
((
32
,
20
,
512
),
(
20480
,
512
,
1
)),
# Ascend 暂不支持非连续
((
32
,
20
,
512
),
(
20480
,
512
,
1
)),
# Ascend 暂不支持非连续
]
]
args
=
get_args
()
args
=
get_args
()
lib
=
open_lib
()
lib
=
open_lib
()
...
...
test/infiniop/conv.py
View file @
3c31dc6c
...
@@ -41,17 +41,11 @@ infiniopConvDescriptor_t = POINTER(ConvDescriptor)
...
@@ -41,17 +41,11 @@ infiniopConvDescriptor_t = POINTER(ConvDescriptor)
def
conv
(
x
,
w
,
stride
,
padding
,
dilation
):
def
conv
(
x
,
w
,
stride
,
padding
,
dilation
):
match
len
(
x
.
shape
)
-
2
:
match
len
(
x
.
shape
)
-
2
:
case
1
:
case
1
:
return
F
.
conv1d
(
return
F
.
conv1d
(
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
case
2
:
case
2
:
return
F
.
conv2d
(
return
F
.
conv2d
(
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
case
3
:
case
3
:
return
F
.
conv3d
(
return
F
.
conv3d
(
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
x
,
w
,
stride
=
stride
,
padding
=
padding
,
dilation
=
dilation
)
case
_
:
case
_
:
print
(
"Error: Pytorch -> Unsupported tensor dimension"
)
print
(
"Error: Pytorch -> Unsupported tensor dimension"
)
return
None
return
None
...
@@ -66,11 +60,15 @@ def inferShape(
...
@@ -66,11 +60,15 @@ def inferShape(
dilations
:
List
[
int
],
dilations
:
List
[
int
],
)
->
Tuple
[
int
,
...]:
)
->
Tuple
[
int
,
...]:
assert
(
assert
(
len
(
x_shape
)
==
len
(
w_shape
)
==
len
(
pads
)
+
2
==
len
(
dilations
)
+
2
==
len
(
strides
)
+
2
len
(
x_shape
)
==
len
(
w_shape
)
==
len
(
pads
)
+
2
==
len
(
dilations
)
+
2
==
len
(
strides
)
+
2
),
"x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
),
"x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
output_dims
=
[
output_dims
=
[
math
.
floor
(
math
.
floor
(
(
x_shape
[
i
+
2
]
+
2
*
pads
[
i
]
-
dilations
[
i
]
*
(
w_shape
[
i
+
2
]
-
1
)
-
1
)
(
x_shape
[
i
+
2
]
+
2
*
pads
[
i
]
-
dilations
[
i
]
*
(
w_shape
[
i
+
2
]
-
1
)
-
1
)
/
strides
[
i
]
/
strides
[
i
]
+
1
+
1
)
)
...
@@ -145,7 +143,9 @@ def test(
...
@@ -145,7 +143,9 @@ def test(
check_error
(
check_error
(
lib
.
infiniopGetConvWorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspaceSize
))
lib
.
infiniopGetConvWorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspaceSize
))
)
)
workspace
=
torch
.
zeros
(
int
(
workspaceSize
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
)
workspace
=
torch
.
zeros
(
int
(
workspaceSize
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
)
workspace_ptr
=
ctypes
.
cast
(
workspace
.
data_ptr
(),
ctypes
.
POINTER
(
ctypes
.
c_uint8
))
workspace_ptr
=
ctypes
.
cast
(
workspace
.
data_ptr
(),
ctypes
.
POINTER
(
ctypes
.
c_uint8
))
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
...
@@ -177,7 +177,7 @@ def test(
...
@@ -177,7 +177,7 @@ def test(
elapsed
=
(
time
.
time
()
-
start_time
)
/
NUM_ITERATIONS
elapsed
=
(
time
.
time
()
-
start_time
)
/
NUM_ITERATIONS
print
(
f
" lib time:
{
elapsed
:
6
f
}
"
)
print
(
f
" lib time:
{
elapsed
:
6
f
}
"
)
if
(
tensor_dtype
==
torch
.
float16
)
:
if
tensor_dtype
==
torch
.
float16
:
assert
torch
.
allclose
(
y
,
ans
,
atol
=
0
,
rtol
=
1e-2
)
assert
torch
.
allclose
(
y
,
ans
,
atol
=
0
,
rtol
=
1e-2
)
else
:
else
:
assert
torch
.
allclose
(
y
,
ans
,
atol
=
0
,
rtol
=
1e-3
)
assert
torch
.
allclose
(
y
,
ans
,
atol
=
0
,
rtol
=
1e-3
)
...
@@ -188,8 +188,10 @@ def test_cpu(lib, test_cases):
...
@@ -188,8 +188,10 @@ def test_cpu(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CPU
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
in
test_cases
:
for
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"cpu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -197,8 +199,10 @@ def test_cuda(lib, test_cases):
...
@@ -197,8 +199,10 @@ def test_cuda(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CUDA
device
=
DeviceEnum
.
DEVICE_CUDA
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
in
test_cases
:
for
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"cuda"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -208,8 +212,10 @@ def test_bang(lib, test_cases):
...
@@ -208,8 +212,10 @@ def test_bang(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_BANG
device
=
DeviceEnum
.
DEVICE_BANG
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
in
test_cases
:
for
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"mlu"
,
x_shape
,
w_shape
,
pads
,
strides
,
dilations
,
x_strides
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
...
test/infiniop/expand.py
View file @
3c31dc6c
...
@@ -47,10 +47,10 @@ def test(
...
@@ -47,10 +47,10 @@ def test(
lib
,
lib
,
handle
,
handle
,
torch_device
,
torch_device
,
y_shape
,
y_shape
,
x_shape
,
x_shape
,
y_stride
=
None
,
y_stride
=
None
,
x_stride
=
None
,
x_stride
=
None
,
tensor_dtype
=
torch
.
float16
,
tensor_dtype
=
torch
.
float16
,
):
):
print
(
print
(
...
@@ -109,8 +109,10 @@ def test_cpu(lib, test_cases):
...
@@ -109,8 +109,10 @@ def test_cpu(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CPU
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
y_shape
,
x_shape
,
y_stride
,
x_stride
in
test_cases
:
for
y_shape
,
x_shape
,
y_stride
,
x_stride
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"cpu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -118,8 +120,10 @@ def test_cuda(lib, test_cases):
...
@@ -118,8 +120,10 @@ def test_cuda(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_CUDA
device
=
DeviceEnum
.
DEVICE_CUDA
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
y_shape
,
x_shape
,
y_stride
,
x_stride
in
test_cases
:
for
y_shape
,
x_shape
,
y_stride
,
x_stride
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"cuda"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -129,13 +133,16 @@ def test_bang(lib, test_cases):
...
@@ -129,13 +133,16 @@ def test_bang(lib, test_cases):
device
=
DeviceEnum
.
DEVICE_BANG
device
=
DeviceEnum
.
DEVICE_BANG
handle
=
create_handle
(
lib
,
device
)
handle
=
create_handle
(
lib
,
device
)
for
y_shape
,
x_shape
,
y_stride
,
x_stride
in
test_cases
:
for
y_shape
,
x_shape
,
y_stride
,
x_stride
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"mlu"
,
y_shape
,
x_shape
,
y_stride
,
x_stride
,
tensor_dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
test_cases
=
[
test_cases
=
[
# fmt: off
# y_shape, x_shape, y_stride, x_stride
# y_shape, x_shape, y_stride, x_stride
((),
(),
None
,
None
),
((),
(),
None
,
None
),
((
3
,
3
),
(
1
,),
None
,
None
),
((
3
,
3
),
(
1
,),
None
,
None
),
...
@@ -146,6 +153,7 @@ if __name__ == "__main__":
...
@@ -146,6 +153,7 @@ if __name__ == "__main__":
((
2
,
3
,
4
,
5
),
(
5
,),
None
,
None
),
((
2
,
3
,
4
,
5
),
(
5
,),
None
,
None
),
((
3
,
2
,
4
,
5
),
(
3
,
2
,
1
,
1
),
None
,
None
),
((
3
,
2
,
4
,
5
),
(
3
,
2
,
1
,
1
),
None
,
None
),
((
32
,
256
,
112
,
112
),
(
32
,
256
,
112
,
1
),
None
,
None
),
((
32
,
256
,
112
,
112
),
(
32
,
256
,
112
,
1
),
None
,
None
),
# fmt: on
]
]
args
=
get_args
()
args
=
get_args
()
lib
=
open_lib
()
lib
=
open_lib
()
...
...
test/infiniop/gemm.py
View file @
3c31dc6c
...
@@ -27,6 +27,7 @@ PROFILE = False
...
@@ -27,6 +27,7 @@ PROFILE = False
NUM_PRERUN
=
10
NUM_PRERUN
=
10
NUM_ITERATIONS
=
1000
NUM_ITERATIONS
=
1000
class
GEMMDescriptor
(
Structure
):
class
GEMMDescriptor
(
Structure
):
_fields_
=
[(
"device"
,
c_int32
)]
_fields_
=
[(
"device"
,
c_int32
)]
...
@@ -34,10 +35,15 @@ class GEMMDescriptor(Structure):
...
@@ -34,10 +35,15 @@ class GEMMDescriptor(Structure):
infiniopGEMMDescriptor_t
=
POINTER
(
GEMMDescriptor
)
infiniopGEMMDescriptor_t
=
POINTER
(
GEMMDescriptor
)
def
gemm
(
A
,
B
,
C
=
None
,
transA
=
False
,
transB
=
False
,
alpha
=
1.0
,
beta
=
0.0
,
dtype
=
torch
.
float32
):
def
gemm
(
A
,
B
,
C
=
None
,
transA
=
False
,
transB
=
False
,
alpha
=
1.0
,
beta
=
0.0
,
dtype
=
torch
.
float32
):
A
=
A
.
T
if
transA
else
A
A
=
A
.
T
if
transA
else
A
B
=
B
.
T
if
transB
else
B
B
=
B
.
T
if
transB
else
B
result
=
alpha
*
torch
.
matmul
(
A
if
dtype
!=
torch
.
float16
else
A
.
to
(
torch
.
float32
),
B
if
dtype
!=
torch
.
float16
else
B
.
to
(
torch
.
float32
)).
to
(
dtype
)
result
=
alpha
*
torch
.
matmul
(
A
if
dtype
!=
torch
.
float16
else
A
.
to
(
torch
.
float32
),
B
if
dtype
!=
torch
.
float16
else
B
.
to
(
torch
.
float32
),
).
to
(
dtype
)
if
C
is
not
None
:
if
C
is
not
None
:
result
+=
beta
*
C
if
dtype
!=
torch
.
float16
else
C
.
to
(
torch
.
float32
)
result
+=
beta
*
C
if
dtype
!=
torch
.
float16
else
C
.
to
(
torch
.
float32
)
if
PROFILE
:
if
PROFILE
:
...
@@ -64,7 +70,7 @@ def test(
...
@@ -64,7 +70,7 @@ def test(
dtype
=
torch
.
float16
,
dtype
=
torch
.
float16
,
):
):
print
(
print
(
f
"Testing GEMM on
{
torch_device
}
with transA:
{
transA
}
transB:
{
transB
}
"
f
"Testing GEMM on
{
torch_device
}
with transA:
{
transA
}
transB:
{
transB
}
"
f
"a_shape:
{
a_shape
}
b_shape:
{
b_shape
}
c_shape:
{
c_shape
}
y_shape:
{
y_shape
}
"
f
"a_shape:
{
a_shape
}
b_shape:
{
b_shape
}
c_shape:
{
c_shape
}
y_shape:
{
y_shape
}
"
f
"a_stride:
{
a_stride
}
b_stride:
{
b_stride
}
c_stride:
{
c_stride
}
y_stride:
{
y_stride
}
dtype:
{
dtype
}
"
f
"a_stride:
{
a_stride
}
b_stride:
{
b_stride
}
c_stride:
{
c_stride
}
y_stride:
{
y_stride
}
dtype:
{
dtype
}
"
)
)
...
@@ -121,9 +127,7 @@ def test(
...
@@ -121,9 +127,7 @@ def test(
workspace_size
=
ctypes
.
c_uint64
(
0
)
workspace_size
=
ctypes
.
c_uint64
(
0
)
check_error
(
check_error
(
lib
.
infiniopGetGEMMWorkspaceSize
(
lib
.
infiniopGetGEMMWorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspace_size
))
descriptor
,
ctypes
.
byref
(
workspace_size
)
)
)
)
workspace
=
torch
.
zeros
(
int
(
workspace_size
.
value
),
dtype
=
torch
.
uint8
).
to
(
workspace
=
torch
.
zeros
(
int
(
workspace_size
.
value
),
dtype
=
torch
.
uint8
).
to
(
torch_device
torch_device
...
@@ -182,8 +186,10 @@ def test_cpu(lib, test_cases):
...
@@ -182,8 +186,10 @@ def test_cpu(lib, test_cases):
c_stride
,
c_stride
,
y_stride
,
y_stride
,
)
in
test_cases
:
)
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cpu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"cpu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -204,8 +210,10 @@ def test_cuda(lib, test_cases):
...
@@ -204,8 +210,10 @@ def test_cuda(lib, test_cases):
c_stride
,
c_stride
,
y_stride
,
y_stride
,
)
in
test_cases
:
)
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"cuda"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"cuda"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -229,9 +237,10 @@ def test_bang(lib, test_cases):
...
@@ -229,9 +237,10 @@ def test_bang(lib, test_cases):
c_stride
,
c_stride
,
y_stride
,
y_stride
,
)
in
test_cases
:
)
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float16
)
test
(
lib
,
handle
,
"mlu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
test
(
lib
,
handle
,
"mlu"
,
alpha
,
beta
,
transA
,
transB
,
a_shape
,
b_shape
,
c_shape
,
y_shape
,
a_stride
,
b_stride
,
c_stride
,
y_stride
,
dtype
=
torch
.
float32
)
# fmt: on
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
...
test/infiniop/global_avg_pool.py
View file @
3c31dc6c
...
@@ -99,7 +99,12 @@ def test(
...
@@ -99,7 +99,12 @@ def test(
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
for
i
in
range
(
NUM_PRERUN
if
PROFILE
else
1
):
check_error
(
check_error
(
lib
.
infiniopGlobalAvgPool
(
lib
.
infiniopGlobalAvgPool
(
descriptor
,
workspace_ptr
,
workspaceSize
,
y_tensor
.
data
,
x_tensor
.
data
,
None
descriptor
,
workspace_ptr
,
workspaceSize
,
y_tensor
.
data
,
x_tensor
.
data
,
None
,
)
)
)
)
if
PROFILE
:
if
PROFILE
:
...
...
test/infiniop/libinfiniop/__init__.py
View file @
3c31dc6c
import
os
import
os
import
sys
import
sys
sys
.
path
.
insert
(
0
,
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'.'
)))
from
.liboperators
import
open_lib
,
CTensor
,
infiniopHandle_t
,
infiniopTensorDescriptor_t
sys
.
path
.
insert
(
0
,
os
.
path
.
abspath
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"."
)))
from
.liboperators
import
(
open_lib
,
CTensor
,
infiniopHandle_t
,
infiniopTensorDescriptor_t
,
)
from
.devices
import
*
from
.devices
import
*
from
.utils
import
*
from
.utils
import
*
from
.datatypes
import
*
from
.datatypes
import
*
test/infiniop/libinfiniop/datatypes.py
View file @
3c31dc6c
...
@@ -7,7 +7,7 @@ class InfiniDtype:
...
@@ -7,7 +7,7 @@ class InfiniDtype:
I32
=
5
I32
=
5
I64
=
6
I64
=
6
U8
=
7
U8
=
7
U16
=
8
U16
=
8
U32
=
9
U32
=
9
U64
=
10
U64
=
10
F8
=
11
F8
=
11
...
...
test/infiniop/libinfiniop/utils.py
View file @
3c31dc6c
...
@@ -54,6 +54,7 @@ def create_workspace(size, torch_device):
...
@@ -54,6 +54,7 @@ def create_workspace(size, torch_device):
if
size
==
0
:
if
size
==
0
:
return
None
return
None
import
torch
import
torch
return
torch
.
zeros
(
size
=
(
size
,),
dtype
=
torch
.
uint8
,
device
=
torch_device
)
return
torch
.
zeros
(
size
=
(
size
,),
dtype
=
torch
.
uint8
,
device
=
torch_device
)
...
@@ -172,6 +173,7 @@ def get_args():
...
@@ -172,6 +173,7 @@ def get_args():
def
synchronize_device
(
torch_device
):
def
synchronize_device
(
torch_device
):
import
torch
import
torch
if
torch_device
==
"cuda"
:
if
torch_device
==
"cuda"
:
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
elif
torch_device
==
"npu"
:
elif
torch_device
==
"npu"
:
...
@@ -197,13 +199,24 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
...
@@ -197,13 +199,24 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
If True, the function will print detailed information about any discrepancies between the tensors.
If True, the function will print detailed information about any discrepancies between the tensors.
"""
"""
import
numpy
as
np
import
numpy
as
np
print_discrepancy
(
actual
,
desired
,
atol
,
rtol
,
verbose
)
print_discrepancy
(
actual
,
desired
,
atol
,
rtol
,
verbose
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
)
def
debug_all
(
actual_vals
:
Sequence
,
desired_vals
:
Sequence
,
condition
:
str
,
atol
=
0
,
rtol
=
1e-2
,
equal_nan
=
False
,
verbose
=
True
):
def
debug_all
(
actual_vals
:
Sequence
,
desired_vals
:
Sequence
,
condition
:
str
,
atol
=
0
,
rtol
=
1e-2
,
equal_nan
=
False
,
verbose
=
True
,
):
"""
"""
Debugging function to compare two sequences of values (actual and desired) pair by pair, results
Debugging function to compare two sequences of values (actual and desired) pair by pair, results
are linked by the given logical condition, and prints discrepancies
are linked by the given logical condition, and prints discrepancies
Arguments:
Arguments:
----------
----------
...
@@ -223,7 +236,10 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
...
@@ -223,7 +236,10 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
- AssertionError: If the specified `condition` is not 'or' or 'and'.
- AssertionError: If the specified `condition` is not 'or' or 'and'.
"""
"""
assert
len
(
actual_vals
)
==
len
(
desired_vals
),
"Invalid Length"
assert
len
(
actual_vals
)
==
len
(
desired_vals
),
"Invalid Length"
assert
condition
in
{
"or"
,
"and"
},
"Invalid condition: should be either 'or' or 'and'"
assert
condition
in
{
"or"
,
"and"
,
},
"Invalid condition: should be either 'or' or 'and'"
import
numpy
as
np
import
numpy
as
np
passed
=
False
if
condition
==
"or"
else
True
passed
=
False
if
condition
==
"or"
else
True
...
@@ -237,14 +253,22 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
...
@@ -237,14 +253,22 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
elif
condition
==
"and"
:
elif
condition
==
"and"
:
if
passed
and
len
(
indices
)
!=
0
:
if
passed
and
len
(
indices
)
!=
0
:
passed
=
False
passed
=
False
print
(
f
"
\033
[31mThe condition has not been satisfied: Condition #
{
index
+
1
}
\033
[0m"
)
print
(
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
)
f
"
\033
[31mThe condition has not been satisfied: Condition #
{
index
+
1
}
\033
[0m"
)
np
.
testing
.
assert_allclose
(
actual
.
cpu
(),
desired
.
cpu
(),
rtol
,
atol
,
equal_nan
,
verbose
=
True
,
strict
=
True
,
)
assert
passed
,
"
\033
[31mThe condition has not been satisfied
\033
[0m"
assert
passed
,
"
\033
[31mThe condition has not been satisfied
\033
[0m"
def
print_discrepancy
(
def
print_discrepancy
(
actual
,
expected
,
atol
=
0
,
rtol
=
1e-3
,
verbose
=
True
):
actual
,
expected
,
atol
=
0
,
rtol
=
1e-3
,
verbose
=
True
):
if
actual
.
shape
!=
expected
.
shape
:
if
actual
.
shape
!=
expected
.
shape
:
raise
ValueError
(
"Tensors must have the same shape to compare."
)
raise
ValueError
(
"Tensors must have the same shape to compare."
)
...
@@ -273,7 +297,9 @@ def print_discrepancy(
...
@@ -273,7 +297,9 @@ def print_discrepancy(
for
idx
in
diff_indices
:
for
idx
in
diff_indices
:
index_tuple
=
tuple
(
idx
.
tolist
())
index_tuple
=
tuple
(
idx
.
tolist
())
actual_str
=
f
"
{
actual
[
index_tuple
]:
<
{
col_width
[
1
]
}
.
{
decimal_places
[
1
]
}
f
}
"
actual_str
=
f
"
{
actual
[
index_tuple
]:
<
{
col_width
[
1
]
}
.
{
decimal_places
[
1
]
}
f
}
"
expected_str
=
f
"
{
expected
[
index_tuple
]:
<
{
col_width
[
2
]
}
.
{
decimal_places
[
2
]
}
f
}
"
expected_str
=
(
f
"
{
expected
[
index_tuple
]:
<
{
col_width
[
2
]
}
.
{
decimal_places
[
2
]
}
f
}
"
)
delta_str
=
f
"
{
delta
[
index_tuple
]:
<
{
col_width
[
3
]
}
.
{
decimal_places
[
3
]
}
f
}
"
delta_str
=
f
"
{
delta
[
index_tuple
]:
<
{
col_width
[
3
]
}
.
{
decimal_places
[
3
]
}
f
}
"
print
(
print
(
f
" > Index:
{
str
(
index_tuple
):
<
{
col_width
[
0
]
}}
"
f
" > Index:
{
str
(
index_tuple
):
<
{
col_width
[
0
]
}}
"
...
@@ -287,10 +313,18 @@ def print_discrepancy(
...
@@ -287,10 +313,18 @@ def print_discrepancy(
print
(
f
" - Desired dtype:
{
expected
.
dtype
}
"
)
print
(
f
" - Desired dtype:
{
expected
.
dtype
}
"
)
print
(
f
" - Atol:
{
atol
}
"
)
print
(
f
" - Atol:
{
atol
}
"
)
print
(
f
" - Rtol:
{
rtol
}
"
)
print
(
f
" - Rtol:
{
rtol
}
"
)
print
(
f
" - Mismatched elements:
{
len
(
diff_indices
)
}
/
{
actual
.
numel
()
}
(
{
len
(
diff_indices
)
/
actual
.
numel
()
*
100
}
%)"
)
print
(
print
(
f
" - Min(actual) :
{
torch
.
min
(
actual
):
<
{
col_width
[
1
]
}}
| Max(actual) :
{
torch
.
max
(
actual
):
<
{
col_width
[
2
]
}}
"
)
f
" - Mismatched elements:
{
len
(
diff_indices
)
}
/
{
actual
.
numel
()
}
(
{
len
(
diff_indices
)
/
actual
.
numel
()
*
100
}
%)"
print
(
f
" - Min(desired):
{
torch
.
min
(
expected
):
<
{
col_width
[
1
]
}}
| Max(desired):
{
torch
.
max
(
expected
):
<
{
col_width
[
2
]
}}
"
)
)
print
(
f
" - Min(delta) :
{
torch
.
min
(
delta
):
<
{
col_width
[
1
]
}}
| Max(delta) :
{
torch
.
max
(
delta
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(actual) :
{
torch
.
min
(
actual
):
<
{
col_width
[
1
]
}}
| Max(actual) :
{
torch
.
max
(
actual
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(desired):
{
torch
.
min
(
expected
):
<
{
col_width
[
1
]
}}
| Max(desired):
{
torch
.
max
(
expected
):
<
{
col_width
[
2
]
}}
"
)
print
(
f
" - Min(delta) :
{
torch
.
min
(
delta
):
<
{
col_width
[
1
]
}}
| Max(delta) :
{
torch
.
max
(
delta
):
<
{
col_width
[
2
]
}}
"
)
print
(
"-"
*
total_width
+
"
\n
"
)
print
(
"-"
*
total_width
+
"
\n
"
)
return
diff_indices
return
diff_indices
...
@@ -298,14 +332,17 @@ def print_discrepancy(
...
@@ -298,14 +332,17 @@ def print_discrepancy(
def
get_tolerance
(
tolerance_map
,
tensor_dtype
,
default_atol
=
0
,
default_rtol
=
1e-3
):
def
get_tolerance
(
tolerance_map
,
tensor_dtype
,
default_atol
=
0
,
default_rtol
=
1e-3
):
"""
"""
Returns the atol and rtol for a given tensor data type in the tolerance_map.
Returns the atol and rtol for a given tensor data type in the tolerance_map.
If the given data type is not found, it returns the provided default tolerance values.
If the given data type is not found, it returns the provided default tolerance values.
"""
"""
return
tolerance_map
.
get
(
tensor_dtype
,
{
'atol'
:
default_atol
,
'rtol'
:
default_rtol
}).
values
()
return
tolerance_map
.
get
(
tensor_dtype
,
{
"atol"
:
default_atol
,
"rtol"
:
default_rtol
}
).
values
()
def
timed_op
(
func
,
num_iterations
,
device
):
def
timed_op
(
func
,
num_iterations
,
device
):
import
time
import
time
""" Function for timing operations with synchronization. """
""" Function for timing operations with synchronization. """
synchronize_device
(
device
)
synchronize_device
(
device
)
start
=
time
.
time
()
start
=
time
.
time
()
...
@@ -318,7 +355,7 @@ def timed_op(func, num_iterations, device):
...
@@ -318,7 +355,7 @@ def timed_op(func, num_iterations, device):
def
profile_operation
(
desc
,
func
,
torch_device
,
NUM_PRERUN
,
NUM_ITERATIONS
):
def
profile_operation
(
desc
,
func
,
torch_device
,
NUM_PRERUN
,
NUM_ITERATIONS
):
"""
"""
Unified profiling workflow that is used to profile the execution time of a given function.
Unified profiling workflow that is used to profile the execution time of a given function.
It first performs a number of warmup runs, then performs timed execution and
It first performs a number of warmup runs, then performs timed execution and
prints the average execution time.
prints the average execution time.
Arguments:
Arguments:
...
@@ -328,11 +365,11 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
...
@@ -328,11 +365,11 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
- torch_device (str): The device on which the operation runs, provided for timed execution.
- torch_device (str): The device on which the operation runs, provided for timed execution.
- NUM_PRERUN (int): The number of warmup runs.
- NUM_PRERUN (int): The number of warmup runs.
- NUM_ITERATIONS (int): The number of timed execution iterations, used to calculate the average execution time.
- NUM_ITERATIONS (int): The number of timed execution iterations, used to calculate the average execution time.
"""
"""
# Warmup runs
# Warmup runs
for
_
in
range
(
NUM_PRERUN
):
for
_
in
range
(
NUM_PRERUN
):
func
()
func
()
# Timed execution
# Timed execution
elapsed
=
timed_op
(
lambda
:
func
(),
NUM_ITERATIONS
,
torch_device
)
elapsed
=
timed_op
(
lambda
:
func
(),
NUM_ITERATIONS
,
torch_device
)
print
(
f
"
{
desc
}
time:
{
elapsed
*
1000
:
6
f
}
ms"
)
print
(
f
"
{
desc
}
time:
{
elapsed
*
1000
:
6
f
}
ms"
)
...
@@ -347,7 +384,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
...
@@ -347,7 +384,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
- lib (ctypes.CDLL): The library object containing the operator implementations.
- lib (ctypes.CDLL): The library object containing the operator implementations.
- device (InfiniDeviceEnum): The device on which the operator should be tested. See device.py.
- device (InfiniDeviceEnum): The device on which the operator should be tested. See device.py.
- test_func (function): The test function to be executed for each test case.
- test_func (function): The test function to be executed for each test case.
- test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
- test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
to be passed to `test_func`.
to be passed to `test_func`.
- tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
- tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
"""
"""
...
@@ -355,7 +392,13 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
...
@@ -355,7 +392,13 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
try
:
try
:
for
test_case
in
test_cases
:
for
test_case
in
test_cases
:
for
tensor_dtype
in
tensor_dtypes
:
for
tensor_dtype
in
tensor_dtypes
:
test_func
(
lib
,
handle
,
infiniDeviceEnum_str_map
[
device
],
*
test_case
,
tensor_dtype
)
test_func
(
lib
,
handle
,
infiniDeviceEnum_str_map
[
device
],
*
test_case
,
tensor_dtype
,
)
finally
:
finally
:
destroy_handle
(
lib
,
handle
)
destroy_handle
(
lib
,
handle
)
...
@@ -365,22 +408,26 @@ def get_test_devices(args):
...
@@ -365,22 +408,26 @@ def get_test_devices(args):
Using the given parsed Namespace to determine the devices to be tested.
Using the given parsed Namespace to determine the devices to be tested.
Argument:
Argument:
- args: the parsed Namespace object.
- args: the parsed Namespace object.
Return:
Return:
- devices_to_test: the devices that will be tested. Default is CPU.
- devices_to_test: the devices that will be tested. Default is CPU.
"""
"""
devices_to_test
=
[]
devices_to_test
=
[]
if
args
.
cpu
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
CPU
)
if
args
.
cpu
:
if
args
.
nvidia
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
NVIDIA
)
devices_to_test
.
append
(
InfiniDeviceEnum
.
CPU
)
if
args
.
cambricon
:
if
args
.
nvidia
:
devices_to_test
.
append
(
InfiniDeviceEnum
.
NVIDIA
)
if
args
.
cambricon
:
import
torch_mlu
import
torch_mlu
devices_to_test
.
append
(
InfiniDeviceEnum
.
CAMBRICON
)
devices_to_test
.
append
(
InfiniDeviceEnum
.
CAMBRICON
)
if
args
.
ascend
:
if
args
.
ascend
:
import
torch
import
torch
import
torch_npu
import
torch_npu
torch
.
npu
.
set_device
(
0
)
# Ascend NPU needs explicit device initialization
torch
.
npu
.
set_device
(
0
)
# Ascend NPU needs explicit device initialization
devices_to_test
.
append
(
InfiniDeviceEnum
.
ASCEND
)
devices_to_test
.
append
(
InfiniDeviceEnum
.
ASCEND
)
if
not
devices_to_test
:
if
not
devices_to_test
:
devices_to_test
=
[
InfiniDeviceEnum
.
CPU
]
devices_to_test
=
[
InfiniDeviceEnum
.
CPU
]
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment