jerrrrry / infinicore · Commits

Commit d417f967 (unverified), authored Jul 11, 2025 by zhiwu zhou; committed by GitHub, Jul 11, 2025
Parent: c6a3e4c7

Issue/213: Add CPU/CUDA implementations of the conv operator

Showing 10 changed files with 1433 additions and 214 deletions (+1433 / -214):
include/infiniop/ops/conv.h                  +5    -4
src/infiniop/ops/conv/conv.h                 +56   -0
src/infiniop/ops/conv/cpu/conv_cpu.cc        +363  -0
src/infiniop/ops/conv/cpu/conv_cpu.h         +8    -0
src/infiniop/ops/conv/cuda/conv_cuda.cu      +429  -0
src/infiniop/ops/conv/cuda/conv_cuda.cuh     +9    -0
src/infiniop/ops/conv/info.h                 +257  -0
src/infiniop/ops/conv/operator.cc            +123  -0
test/infiniop/conv.py                        +147  -210
test/infiniop/libinfiniop/op_register.py     +36   -0
include/infiniop/ops/conv.h

...
@@ -7,9 +7,10 @@ typedef struct InfiniopDescriptor *infiniopConvDescriptor_t;
 __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle,
                                                          infiniopConvDescriptor_t *desc_ptr,
-                                                         infiniopTensorDescriptor_t y,
-                                                         infiniopTensorDescriptor_t x,
-                                                         infiniopTensorDescriptor_t w,
+                                                         infiniopTensorDescriptor_t y_desc,
+                                                         infiniopTensorDescriptor_t x_desc,
+                                                         infiniopTensorDescriptor_t w_desc,
+                                                         infiniopTensorDescriptor_t b_desc,
                                                          void *pads,
                                                          void *strides,
                                                          void *dilations,
...
@@ -17,7 +18,7 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle
 __C __export infiniStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, size_t *size);

-__C __export infiniStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, size_t workspace_size, void *y, void const *x, void const *w, void *stream);
+__C __export infiniStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, size_t workspace_size, void *y, const void *x, const void *w, const void *bias, void *stream);

 __C __export infiniStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc);
...
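
For orientation, a minimal sketch of how a caller might drive the updated API. This is a hedged illustration, not code from the commit: `handle`, the tensor descriptors, the device buffers (`y`, `x`, `w`, `bias`, `stream`), `device_alloc`, and the `CHECK` status macro are all placeholders assumed to exist on the caller's side. Note that `ConvInfo` reads pads/dilations as `size_t` and strides as `ptrdiff_t`:

    // Hedged usage sketch of the new conv API (non-infiniop names are placeholders).
    size_t pads[2] = {2, 2};
    ptrdiff_t strides[2] = {2, 2};
    size_t dilations[2] = {1, 1};

    infiniopConvDescriptor_t conv_desc;
    CHECK(infiniopCreateConvDescriptor(handle, &conv_desc, y_desc, x_desc,
                                       w_desc, b_desc, pads, strides,
                                       dilations, /*n=*/2));

    // Query and allocate the workspace the backend asked for.
    size_t workspace_size = 0;
    CHECK(infiniopGetConvWorkspaceSize(conv_desc, &workspace_size));
    void *workspace = device_alloc(workspace_size);  // placeholder allocator

    // bias may be nullptr when no b_desc was supplied at creation time.
    CHECK(infiniopConv(conv_desc, workspace, workspace_size, y, x, w, bias, stream));
    CHECK(infiniopDestroyConvDescriptor(conv_desc));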
src/infiniop/ops/conv/conv.h (new file, 0 → 100644)
#ifndef __CONV_H__
#define __CONV_H__
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::conv::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
infiniDtype_t _dtype; \
ConvInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
infiniDtype_t dtype, \
ConvInfo info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_dtype(dtype), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y, \
infiniopTensorDescriptor_t x, \
infiniopTensorDescriptor_t w, \
infiniopTensorDescriptor_t b, \
const void *pads, \
const void *strides, \
const void *dilations, \
size_t n); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *y, \
const void *x, \
const void *w, \
const void *bias, \
void *stream) const; \
}; \
}
#endif // __CONV_H__
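
As a reading aid (not part of the diff), `DESCRIPTOR(cpu)` expands to roughly the following class inside `op::conv::cpu`; the `cuda` instantiation is identical apart from the namespace:

    namespace op::conv::cpu {
    class Descriptor final : public InfiniopDescriptor {
        struct Opaque;          // backend-specific state, defined in the .cc/.cu file
        Opaque *_opaque;
        infiniDtype_t _dtype;
        ConvInfo _info;
        size_t _workspace_size;
        // private constructor as in the macro body above
    public:
        ~Descriptor();
        size_t workspaceSize() const;
        static infiniStatus_t create(/* handle, desc_ptr, y, x, w, b,
                                        pads, strides, dilations, n */);
        infiniStatus_t calculate(/* workspace, workspace_size,
                                    y, x, w, bias, stream */) const;
    };
    } // namespace op::conv::cpu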
src/infiniop/ops/conv/cpu/conv_cpu.cc (new file, 0 → 100644)
#include "conv_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include <algorithm>

namespace op::conv::cpu {

inline size_t calculatePaddedInputSize(const ConvInfo &info) {
    std::vector<size_t> shape(info.ndim() + 2);
    shape[0] = info.batch();
    shape[1] = info.in_channels();
    for (size_t i = 0; i < info.ndim(); ++i) {
        shape[i + 2] = info.input_dim(i);
    }
    return op::common_cpu::getPaddedSize(info.ndim() + 2, shape.data(), info.getPadsInfo());
}

inline size_t calculateOutputSize(const ConvInfo &info) {
    size_t size = info.batch() * info.out_channels();
    for (size_t i = 0; i < info.ndim(); ++i) {
        size *= info.output_dim(i);
    }
    return size;
}

inline bool needsPadding(const ConvInfo &info) {
    const size_t *pads = info.getPadsInfo();
    for (size_t i = 0; i < info.ndim(); ++i) {
        if (pads[i] > 0) {
            return true;
        }
    }
    return false;
}

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    infiniopTensorDescriptor_t b_desc,
    const void *pads,
    const void *strides,
    const void *dilations,
    size_t n) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = y_desc->dtype();
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
    auto result = ConvInfo::create(handle_, y_desc, x_desc, w_desc, b_desc, pads, strides, dilations, n);
    CHECK_RESULT(result);
    size_t WorkSpaceSize = 0;
    const ConvInfo &info = result.take();
    if (needsPadding(info)) {
        WorkSpaceSize += calculatePaddedInputSize(info) * infiniSizeOf(dtype);
    }
    if (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16) {
        WorkSpaceSize += calculateOutputSize(info) * sizeof(float);
    }
    *desc_ptr = new Descriptor(dtype, std::move(info), WorkSpaceSize, nullptr,
                               handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

template <typename Tdata>
void fillPaddedInput(
    const ConvInfo &info,
    const Tdata *x,
    const size_t *padded_x_shape,
    Tdata *padded_x,
    size_t x_index,
    size_t padded_x_index,
    size_t ndim) {
    size_t x_shape_val;
    if (ndim == 0) {
        x_shape_val = info.batch();
    } else if (ndim == 1) {
        x_shape_val = info.in_channels();
    } else {
        x_shape_val = info.input_dim(ndim - 2);
    }
    const auto padded_x_shape_val = padded_x_shape[ndim];
    const auto x_base_index = x_index * x_shape_val;
    size_t pad_offset = 0;
    if (ndim >= 2 && x_shape_val != padded_x_shape_val) {
        pad_offset = info.pad_info(ndim - 2);
    }
    const auto padded_x_base_index = padded_x_index * padded_x_shape_val + pad_offset;

    for (size_t i = 0; i < x_shape_val; ++i) {
        if (ndim == info.ndim() + 2 - 1) {
            padded_x[padded_x_base_index + i] = x[x_base_index + i];
        } else {
            fillPaddedInput(info, x, padded_x_shape, padded_x,
                            x_base_index + i, padded_x_base_index + i, ndim + 1);
        }
    }
}

template <typename Xdata, typename Ydata>
void _applyConv(
    const ConvInfo &info,
    Ydata *y,
    const Xdata *x,
    const Xdata *w,
    const size_t *x_shape,
    size_t x_index,
    size_t w_index,
    size_t y_index,
    size_t ndim) {
    size_t dim_size, kernel_size;
    size_t dilation, stride;
    if (ndim < 2) {
        return;
    } else {
        dim_size = x_shape[ndim];
        kernel_size = info.kernel_dim(ndim - 2);
        dilation = info.dilation_info(ndim - 2);
        stride = info.stride_info(ndim - 2);
    }
    if (stride == 0) {
        std::cerr << "Error: stride cannot be zero." << std::endl;
    }
    const auto steps = (dim_size - dilation * (kernel_size - 1) - 1) / stride + 1;
    x_index *= dim_size;
    w_index *= kernel_size;
    size_t y_stride;
    if (ndim == 0) {
        y_stride = info.out_channels();
    } else if (ndim == 1) {
        y_stride = 1;
    } else {
        y_stride = info.output_dim(ndim - 2);
    }
    y_index *= y_stride;

    for (size_t i = 0; i < steps; ++i, ++y_index) {
        for (size_t k = 0; k < kernel_size; ++k) {
            const auto curr_x_index = x_index + i * stride + k * dilation;
            const auto curr_w_index = w_index + k;
            if (ndim == info.ndim() + 1) {
                if constexpr (std::is_same<Xdata, fp16_t>::value || std::is_same<Xdata, bf16_t>::value) {
                    y[y_index] += utils::cast<float>(x[curr_x_index]) * utils::cast<float>(w[curr_w_index]);
                } else {
                    y[y_index] += x[curr_x_index] * w[curr_w_index];
                }
            } else {
                _applyConv(info, y, x, w, x_shape, curr_x_index, curr_w_index, y_index, ndim + 1);
            }
        }
    }
}

template <typename Xdata, typename Ydata>
void applyConv(
    const ConvInfo &info,
    Ydata *y,
    const Xdata *x,
    const Xdata *w,
    const size_t *x_shape) {
    const ptrdiff_t batch_size = static_cast<ptrdiff_t>(info.batch());
    const ptrdiff_t out_channels = static_cast<ptrdiff_t>(info.out_channels());
    const ptrdiff_t total_iterations = batch_size * out_channels;

#pragma omp parallel for schedule(dynamic)
    for (ptrdiff_t iter = 0; iter < total_iterations; ++iter) {
        const ptrdiff_t i = iter / out_channels; // batch index
        const ptrdiff_t j = iter % out_channels; // output channel index
        const size_t y_index = static_cast<size_t>(i) * info.out_channels() + static_cast<size_t>(j);
        // inner loop: iterate over the input channels
        for (size_t k = 0; k < info.in_channels(); ++k) {
            const size_t x_index = static_cast<size_t>(i) * info.in_channels() + k;
            const size_t w_index = static_cast<size_t>(j) * info.in_channels() + k;
            _applyConv(info, y, x, w, x_shape, x_index, w_index, y_index, 2);
        }
    }
}

template <typename Xdata, typename Ydata>
void _conv_cpu(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    Ydata *y,
    const Xdata *x,
    const Xdata *w) {
    if (needsPadding(info)) {
        auto padded_x = reinterpret_cast<Xdata *>(workspace);
        if constexpr (std::is_same<Xdata, fp16_t>::value) {
            fp16_t zero_val = utils::cast<fp16_t>(0.0f);
            std::fill(padded_x, padded_x + calculatePaddedInputSize(info), zero_val);
        } else if constexpr (std::is_same<Xdata, bf16_t>::value) {
            bf16_t zero_val = utils::cast<bf16_t>(0.0f);
            std::fill(padded_x, padded_x + calculatePaddedInputSize(info), zero_val);
        } else if constexpr (std::is_same<Xdata, float>::value) {
            std::fill(padded_x, padded_x + calculatePaddedInputSize(info), 0.0f);
        }
        fillPaddedInput(info, x, info.getPaddedShape(), padded_x, 0, 0, 0);
        applyConv(info, y, padded_x, w, info.getPaddedShape());
    } else {
        std::vector<size_t> shape(info.ndim() + 2);
        shape[0] = info.batch();
        shape[1] = info.in_channels();
        for (size_t i = 0; i < info.ndim(); ++i) {
            shape[i + 2] = info.input_dim(i);
        }
        applyConv(info, y, x, w, shape.data());
    }
}

template <typename Tdata>
infiniStatus_t conv_cpu(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias) {
    auto y_ptr = reinterpret_cast<Tdata *>(y);
    auto x_ptr = reinterpret_cast<const Tdata *>(x);
    auto w_ptr = reinterpret_cast<const Tdata *>(w);
    auto output_size = calculateOutputSize(info);
    if constexpr (std::is_same<Tdata, float>::value) {
        std::fill(y_ptr, y_ptr + output_size, 0.0f);
    } else {
        std::fill(y_ptr, y_ptr + output_size, static_cast<Tdata>(0));
    }
    _conv_cpu<Tdata, Tdata>(info, workspace, workspace_size, y_ptr, x_ptr, w_ptr);
    if (bias != nullptr) {
        auto bias_ptr = reinterpret_cast<const Tdata *>(bias);
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
            size_t channel_idx = (i / info.spatial_sizes()) % info.out_channels();
            y_ptr[i] += bias_ptr[channel_idx];
        }
    }
    return INFINI_STATUS_SUCCESS;
}

template <>
infiniStatus_t conv_cpu<fp16_t>(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias) {
    auto y_float = reinterpret_cast<float *>(workspace);
    auto x_half = reinterpret_cast<const fp16_t *>(x);
    auto w_half = reinterpret_cast<const fp16_t *>(w);
    auto output_size = calculateOutputSize(info);
    std::fill(y_float, y_float + output_size, 0.0f);
    void *conv_workspace = y_float + output_size;
    size_t conv_workspace_size = workspace_size - output_size * sizeof(float);
    _conv_cpu<fp16_t, float>(info, conv_workspace, conv_workspace_size, y_float, x_half, w_half);
    auto y_half = reinterpret_cast<fp16_t *>(y);
    if (bias != nullptr) {
        auto bias_half = reinterpret_cast<const fp16_t *>(bias);
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
            size_t channel_idx = (i / info.spatial_sizes()) % info.out_channels();
            float bias_value = utils::cast<float>(bias_half[channel_idx]);
            y_float[i] += bias_value;
            y_half[i] = utils::cast<fp16_t>(y_float[i]);
        }
    } else {
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
            y_half[i] = utils::cast<fp16_t>(y_float[i]);
        }
    }
    return INFINI_STATUS_SUCCESS;
}

template <>
infiniStatus_t conv_cpu<bf16_t>(
    const ConvInfo &info,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias) {
    auto y_float = reinterpret_cast<float *>(workspace);
    auto x_half = reinterpret_cast<const bf16_t *>(x);
    auto w_half = reinterpret_cast<const bf16_t *>(w);
    auto output_size = calculateOutputSize(info);
    std::fill(y_float, y_float + output_size, 0.0f);
    void *conv_workspace = y_float + output_size;
    size_t conv_workspace_size = workspace_size - output_size * sizeof(float);
    _conv_cpu<bf16_t, float>(info, conv_workspace, conv_workspace_size, y_float, x_half, w_half);
    auto y_half = reinterpret_cast<bf16_t *>(y);
    if (bias != nullptr) {
        auto bias_half = reinterpret_cast<const bf16_t *>(bias);
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
            size_t channel_idx = (i / info.spatial_sizes()) % info.out_channels();
            float bias_value = utils::cast<float>(bias_half[channel_idx]);
            y_float[i] += bias_value;
            y_half[i] = utils::cast<bf16_t>(y_float[i]);
        }
    } else {
#pragma omp parallel for
        for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
            y_half[i] = utils::cast<bf16_t>(y_float[i]);
        }
    }
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return conv_cpu<fp16_t>(_info, workspace, workspace_size, y, x, w, bias);
    case INFINI_DTYPE_F32:
        return conv_cpu<float>(_info, workspace, workspace_size, y, x, w, bias);
    case INFINI_DTYPE_BF16:
        return conv_cpu<bf16_t>(_info, workspace, workspace_size, y, x, w, bias);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::conv::cpu
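
For intuition (this block is an illustration, not code from the commit): the recursive `_applyConv` generalizes the familiar direct loop nest to any spatial rank. Specialized to one (batch, out-channel, in-channel) 2-D slice with float data, using the same per-dimension step count `steps = (dim - dilation*(kernel-1) - 1) / stride + 1`, it amounts to:

    // Direct 2-D cross-correlation over one channel slice; y must be
    // zero-initialized by the caller, matching what conv_cpu does.
    void conv2d_slice(const float *x, const float *w, float *y,
                      size_t H, size_t W,      // input spatial dims
                      size_t KH, size_t KW,    // kernel spatial dims
                      size_t sh, size_t sw,    // strides
                      size_t dh, size_t dw) {  // dilations
        const size_t OH = (H - dh * (KH - 1) - 1) / sh + 1;
        const size_t OW = (W - dw * (KW - 1) - 1) / sw + 1;
        for (size_t oh = 0; oh < OH; ++oh)
            for (size_t ow = 0; ow < OW; ++ow)
                for (size_t kh = 0; kh < KH; ++kh)
                    for (size_t kw = 0; kw < KW; ++kw)
                        y[oh * OW + ow] +=
                            x[(oh * sh + kh * dh) * W + (ow * sw + kw * dw)]
                            * w[kh * KW + kw];
    }

The recursion trades this fixed nest for one call level per dimension, which is what lets a single implementation cover 1-D, 2-D, and 3-D cases.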
src/infiniop/ops/conv/cpu/conv_cpu.h (new file, 0 → 100644)

#ifndef __CONV_CPU_H__
#define __CONV_CPU_H__

#include "../conv.h"

DESCRIPTOR(cpu)

#endif // __CONV_CPU_H__
src/infiniop/ops/conv/cuda/conv_cuda.cu (new file, 0 → 100644)
#include "../../../devices/cuda/cuda_handle.cuh"
#include "conv_cuda.cuh"
#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \
do { \
if (desc_ptr) { \
destroy_func(desc_ptr); \
desc_ptr = nullptr; \
} \
} while (0)
#define CLEANUP_CUDNN_DESCRIPTORS() \
do { \
DESTROY_CUDNN_DESCRIPTOR(x_desc, cudnnDestroyTensorDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(y_desc, cudnnDestroyTensorDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(w_desc, cudnnDestroyFilterDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(b_desc, cudnnDestroyTensorDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(act_desc, cudnnDestroyActivationDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \
} while (0)

namespace op::conv::cuda {

struct Descriptor::Opaque {
    std::shared_ptr<device::cuda::Handle::Internal> internal;
    cudnnTensorDescriptor_t x_desc = nullptr;
    cudnnTensorDescriptor_t y_desc = nullptr;
    cudnnFilterDescriptor_t w_desc = nullptr;
    cudnnTensorDescriptor_t b_desc = nullptr;
    cudnnActivationDescriptor_t act_desc = nullptr;
    cudnnConvolutionDescriptor_t conv_desc = nullptr;
    cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
    size_t workspace_size = 0;

private:
    Opaque(std::shared_ptr<device::cuda::Handle::Internal> internal_ptr)
        : internal(internal_ptr) {}

    void initializeDimensionArrays(
        const ConvInfo &info,
        std::vector<int> &input_dims,
        std::vector<int> &output_dims,
        std::vector<int> &filter_dims,
        std::vector<int> &input_strides,
        std::vector<int> &output_strides) const {
        bool is_1d_conv = (info.ndim() == 1);
        int actual_tensor_ndim = is_1d_conv ? 4 : static_cast<int>(info.ndim() + 2);
        input_dims[0] = static_cast<int>(info.batch());
        input_dims[1] = static_cast<int>(info.in_channels());
        output_dims[0] = static_cast<int>(info.batch());
        output_dims[1] = static_cast<int>(info.out_channels());
        filter_dims[0] = static_cast<int>(info.out_channels());
        filter_dims[1] = static_cast<int>(info.in_channels());
        if (is_1d_conv) {
            input_dims[2] = 1;
            input_dims[3] = static_cast<int>(info.input_dim(0));
            output_dims[2] = 1;
            output_dims[3] = static_cast<int>(info.output_dim(0));
            filter_dims[2] = 1;
            filter_dims[3] = static_cast<int>(info.kernel_dim(0));
        } else {
            for (size_t i = 0; i < info.ndim(); ++i) {
                input_dims[i + 2] = static_cast<int>(info.input_dim(i));
                output_dims[i + 2] = static_cast<int>(info.output_dim(i));
                filter_dims[i + 2] = static_cast<int>(info.kernel_dim(i));
            }
        }
        calculateStrides(input_dims, input_strides, actual_tensor_ndim);
        calculateStrides(output_dims, output_strides, actual_tensor_ndim);
    }

    void initializeConvolutionParams(
        const ConvInfo &info,
        std::vector<int> &pads,
        std::vector<int> &strides,
        std::vector<int> &dilations) const {
        bool is_1d_conv = (info.ndim() == 1);
        if (is_1d_conv) {
            pads[0] = 0;
            pads[1] = static_cast<int>(info.pad_info(0));
            strides[0] = 1;
            strides[1] = static_cast<int>(info.stride_info(0));
            dilations[0] = 1;
            dilations[1] = static_cast<int>(info.dilation_info(0));
        } else {
            for (size_t i = 0; i < info.ndim(); ++i) {
                pads[i] = static_cast<int>(info.pad_info(i));
                strides[i] = static_cast<int>(info.stride_info(i));
                dilations[i] = static_cast<int>(info.dilation_info(i));
            }
        }
    }

    void calculateStrides(const std::vector<int> &dims, std::vector<int> &strides, int ndim) const {
        strides[ndim - 1] = 1;
        for (int d = ndim - 2; d >= 0; --d) {
            strides[d] = strides[d + 1] * dims[d + 1];
        }
    }

    infiniStatus_t getCudnnDataType(infiniDtype_t data_type, cudnnDataType_t &cudnn_data_type) const {
        if (data_type == INFINI_DTYPE_F16) {
            cudnn_data_type = device::cuda::getCudnnDtype(data_type);
        } else if (data_type == INFINI_DTYPE_F32) {
            cudnn_data_type = device::cuda::getCudnnDtype(data_type);
        } else if (data_type == INFINI_DTYPE_BF16) {
            cudnn_data_type = device::cuda::getCudnnDtype(data_type);
        } else {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        return INFINI_STATUS_SUCCESS;
    }

    infiniStatus_t createBasicDescriptors(
        const std::vector<int> &input_dims,
        const std::vector<int> &output_dims,
        const std::vector<int> &filter_dims,
        cudnnDataType_t cudnn_data_type,
        int actual_tensor_ndim) {
        CHECK_CUDNN(cudnnCreateTensorDescriptor(&x_desc));
        CHECK_CUDNN(cudnnCreateTensorDescriptor(&y_desc));
        CHECK_CUDNN(cudnnCreateFilterDescriptor(&w_desc));
        CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));
        CHECK_CUDNN(cudnnSetTensorNdDescriptorEx(x_desc, CUDNN_TENSOR_NCHW, cudnn_data_type,
                                                 actual_tensor_ndim, input_dims.data()));
        CHECK_CUDNN(cudnnSetTensorNdDescriptorEx(y_desc, CUDNN_TENSOR_NCHW, cudnn_data_type,
                                                 actual_tensor_ndim, output_dims.data()));
        CHECK_CUDNN(cudnnSetFilterNdDescriptor(w_desc, cudnn_data_type, CUDNN_TENSOR_NCHW,
                                               actual_tensor_ndim, filter_dims.data()));
        return INFINI_STATUS_SUCCESS;
    }

    infiniStatus_t createBiasDescriptors(const ConvInfo &info, cudnnDataType_t cudnn_data_type,
                                         int actual_tensor_ndim) {
        if (info.bias_dims_size() == 0) {
            b_desc = nullptr;
            act_desc = nullptr;
            return INFINI_STATUS_SUCCESS;
        }
        std::vector<int> bias_dims_arr(actual_tensor_ndim);
        std::vector<int> bias_strides_arr(actual_tensor_ndim);
        bias_dims_arr[0] = 1;
        bias_dims_arr[1] = static_cast<int>(info.out_channels());
        for (int i = 2; i < actual_tensor_ndim; ++i) {
            bias_dims_arr[i] = 1;
        }
        if (actual_tensor_ndim == 4) {
            bias_strides_arr[0] = static_cast<int>(info.out_channels());
            bias_strides_arr[1] = 1;
            bias_strides_arr[2] = 1;
            bias_strides_arr[3] = 1;
        } else {
            calculateStrides(bias_dims_arr, bias_strides_arr, actual_tensor_ndim);
        }
        CHECK_CUDNN(cudnnCreateTensorDescriptor(&b_desc));
        CHECK_CUDNN(cudnnSetTensorNdDescriptor(b_desc, cudnn_data_type, bias_dims_arr.size(),
                                               bias_dims_arr.data(), bias_strides_arr.data()));
        CHECK_CUDNN(cudnnCreateActivationDescriptor(&act_desc));
        CHECK_CUDNN(cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_IDENTITY,
                                                 CUDNN_NOT_PROPAGATE_NAN, 0.0));
        return INFINI_STATUS_SUCCESS;
    }

    infiniStatus_t setupConvolutionDescriptor(
        const std::vector<int> &pads,
        const std::vector<int> &strides,
        const std::vector<int> &dilations,
        int spatial_ndim,
        cudnnDataType_t compute_type) {
        CHECK_CUDNN(cudnnSetConvolutionNdDescriptor(conv_desc, spatial_ndim, pads.data(),
                                                    strides.data(), dilations.data(),
                                                    CUDNN_CROSS_CORRELATION, compute_type));
        return INFINI_STATUS_SUCCESS;
    }

    infiniStatus_t setupAlgorithmWithoutBias() {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        CHECK_STATUS(internal->useCudnn(nullptr, [&](cudnnHandle_t handle) {
            CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
                handle, x_desc, w_desc, conv_desc, y_desc, algo, &workspace_size));
            return INFINI_STATUS_SUCCESS;
        }));
        return INFINI_STATUS_SUCCESS;
    }

    infiniStatus_t setupAlgorithmWithBias() {
        int maxAlgoCount = 0;
        CHECK_STATUS(internal->useCudnn(nullptr, [&](cudnnHandle_t handle) {
            CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle, &maxAlgoCount));
            return INFINI_STATUS_SUCCESS;
        }));
        if (maxAlgoCount <= 0) {
            maxAlgoCount = 8;
        }
        std::vector<cudnnConvolutionFwdAlgoPerf_t> perf_results(maxAlgoCount);
        int algoCounts = 0;
        CHECK_STATUS(internal->useCudnn(nullptr, [&](cudnnHandle_t handle) {
            CHECK_CUDNN(cudnnFindConvolutionForwardAlgorithm(
                handle, x_desc, w_desc, conv_desc, y_desc,
                maxAlgoCount, &algoCounts, perf_results.data()));
            return INFINI_STATUS_SUCCESS;
        }));
        if (algoCounts < 1) {
            return INFINI_STATUS_BAD_PARAM;
        }
        for (int i = 0; i < algoCounts; ++i) {
            CHECK_STATUS(internal->useCudnn(nullptr, [&](cudnnHandle_t handle) {
                CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
                    handle, x_desc, w_desc, conv_desc, y_desc,
                    perf_results[i].algo, &workspace_size));
                return INFINI_STATUS_SUCCESS;
            }));
            algo = perf_results[i].algo;
            break;
        }
        return INFINI_STATUS_SUCCESS;
    }

public:
    Opaque(Opaque &&other) noexcept
        : internal(std::move(other.internal)),
          x_desc(other.x_desc),
          y_desc(other.y_desc),
          w_desc(other.w_desc),
          b_desc(other.b_desc),
          act_desc(other.act_desc),
          conv_desc(other.conv_desc),
          algo(other.algo),
          workspace_size(other.workspace_size) {
        other.x_desc = nullptr;
        other.y_desc = nullptr;
        other.w_desc = nullptr;
        other.b_desc = nullptr;
        other.act_desc = nullptr;
        other.conv_desc = nullptr;
        other.workspace_size = 0;
    }

    ~Opaque() {
        CLEANUP_CUDNN_DESCRIPTORS();
    }

    infiniStatus_t initializeCudnnContext(ConvInfo &info, infiniDtype_t data_type,
                                          cudnnDataType_t compute_type) {
        bool is_1d_conv = (info.ndim() == 1);
        int actual_tensor_ndim = is_1d_conv ? 4 : static_cast<int>(info.ndim() + 2);
        int spatial_ndim_for_conv_desc = is_1d_conv ? 2 : static_cast<int>(info.ndim());
        std::vector<int> input_dims_arr(actual_tensor_ndim);
        std::vector<int> output_dims_arr(actual_tensor_ndim);
        std::vector<int> filter_dims_arr(actual_tensor_ndim);
        std::vector<int> input_strides_arr(actual_tensor_ndim);
        std::vector<int> output_strides_arr(actual_tensor_ndim);
        initializeDimensionArrays(info, input_dims_arr, output_dims_arr, filter_dims_arr,
                                  input_strides_arr, output_strides_arr);
        std::vector<int> pads_arr(spatial_ndim_for_conv_desc);
        std::vector<int> strides_arr(spatial_ndim_for_conv_desc);
        std::vector<int> dilations_arr(spatial_ndim_for_conv_desc);
        initializeConvolutionParams(info, pads_arr, strides_arr, dilations_arr);
        cudnnDataType_t cudnn_data_type;
        CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type));
        CHECK_STATUS(createBasicDescriptors(input_dims_arr, output_dims_arr, filter_dims_arr,
                                            cudnn_data_type, actual_tensor_ndim));
        CHECK_STATUS(createBiasDescriptors(info, cudnn_data_type, actual_tensor_ndim));
        CHECK_STATUS(setupConvolutionDescriptor(pads_arr, strides_arr, dilations_arr,
                                                spatial_ndim_for_conv_desc, compute_type));
        if (info.bias_dims_size() == 0) {
            CHECK_STATUS(setupAlgorithmWithoutBias());
        } else {
            CHECK_STATUS(setupAlgorithmWithBias());
        }
        return INFINI_STATUS_SUCCESS;
    }

    static inline utils::Result<Opaque> create(
        std::shared_ptr<device::cuda::Handle::Internal> internal_ptr,
        ConvInfo &info,
        infiniDtype_t data_type) {
        Opaque opaque(internal_ptr);
        auto status = opaque.initializeCudnnContext(info, data_type, CUDNN_DATA_FLOAT);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }
        return utils::Result<Opaque>(std::move(opaque));
    }
};

Descriptor::~Descriptor() {
    if (_opaque) {
        delete _opaque;
    }
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    infiniopTensorDescriptor_t b_desc,
    const void *pads,
    const void *strides,
    const void *dilations,
    size_t n) {
    auto handle = reinterpret_cast<device::cuda::nvidia::Handle *>(handle_);
    auto dtype = y_desc->dtype();
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
    auto result = ConvInfo::create(handle_, y_desc, x_desc, w_desc, b_desc, pads, strides, dilations, n);
    CHECK_RESULT(result);
    auto conv_info = result.take();
    auto opaque_result = Opaque::create(handle->internal(), conv_info, dtype);
    CHECK_RESULT(opaque_result);
    auto opaque = new Opaque(std::move(opaque_result.take()));
    *desc_ptr = new Descriptor(dtype, std::move(conv_info), opaque->workspace_size,
                               opaque, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias,
    void *stream) const {
    const float alpha = 1.0f, beta = 0.0f;
    if (bias != nullptr) {
        CHECK_STATUS(_opaque->internal->useCudnn(
            (cudaStream_t)stream, [&](cudnnHandle_t handle) {
                CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
                    handle, &alpha,
                    _opaque->x_desc, x,
                    _opaque->w_desc, w,
                    _opaque->conv_desc, _opaque->algo,
                    workspace, workspace_size,
                    &beta,
                    _opaque->y_desc, y,
                    _opaque->b_desc, bias,
                    _opaque->act_desc,
                    _opaque->y_desc, y));
                return INFINI_STATUS_SUCCESS;
            }));
    } else {
        CHECK_STATUS(_opaque->internal->useCudnn(
            (cudaStream_t)stream, [&](cudnnHandle_t handle) {
                CHECK_CUDNN(cudnnConvolutionForward(
                    handle, &alpha,
                    _opaque->x_desc, x,
                    _opaque->w_desc, w,
                    _opaque->conv_desc, _opaque->algo,
                    workspace, workspace_size,
                    &beta,
                    _opaque->y_desc, y));
                return INFINI_STATUS_SUCCESS;
            }));
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::conv::cuda
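
One detail worth noting in `initializeDimensionArrays`/`initializeConvolutionParams`: a 1-D convolution is promoted to a 4-D NCHW problem with a dummy spatial dimension of size 1, since cuDNN's Nd descriptors expect at least four tensor dimensions. A hedged standalone illustration of that mapping (the values are examples, not taken from the commit):

    // x of logical shape (N, C, L) is described to cuDNN as (N, C, 1, L).
    const int N = 32, C = 3, L = 4;
    const int pad = 1, stride = 1, dilation = 1;
    int input_dims[4]   = {N, C, 1, L};
    int conv_pads[2]    = {0, pad};       // dummy dimension gets pad 0,
    int conv_strides[2] = {1, stride};    // stride 1, and dilation 1,
    int conv_dils[2]    = {1, dilation};  // so it behaves as a no-op

This keeps a single descriptor-setup path for 1-D, 2-D, and 3-D convolutions at the cost of one degenerate dimension.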
src/infiniop/ops/conv/cuda/conv_cuda.cuh (new file, 0 → 100644)

#ifndef __CONV_CUDA_CUH__
#define __CONV_CUDA_CUH__

#include "../../../devices/cuda/cuda_common.cuh"
#include "../conv.h"

DESCRIPTOR(cuda)

#endif // __CONV_CUDA_CUH__
src/infiniop/ops/conv/info.h (new file, 0 → 100644)
#ifndef __CONV_INFO_H__
#define __CONV_INFO_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#ifdef ENABLE_CUDA_API
#include "../../devices/cuda/cuda_handle.cuh"
#endif
namespace op::conv {
class ConvInfo;
} // namespace op::conv

namespace op::conv {

class ConvInfo {
private:
    std::vector<size_t> _meta;
    size_t _ndim;
    size_t _batch;
    size_t _in_channels;
    size_t _out_channels;
    size_t _spatial_sizes;
    size_t _bias_dims_size;
    size_t _padded_shape_size;

    ConvInfo(std::vector<size_t> meta,
             size_t ndim,
             size_t batch,
             size_t in_channels,
             size_t out_channels,
             size_t spatial_sizes,
             size_t bias_dims_size,
             size_t padded_shape_size)
        : _meta(std::move(meta)),
          _ndim(ndim),
          _batch(batch),
          _in_channels(in_channels),
          _out_channels(out_channels),
          _spatial_sizes(spatial_sizes),
          _bias_dims_size(bias_dims_size),
          _padded_shape_size(padded_shape_size) {}

public:
    inline size_t ndim() const { return _ndim; }
    inline size_t batch() const { return _batch; }
    inline size_t in_channels() const { return _in_channels; }
    inline size_t out_channels() const { return _out_channels; }
    inline size_t spatial_sizes() const { return _spatial_sizes; }
    inline size_t bias_dims_size() const { return _bias_dims_size; }
    inline size_t padded_shape_size() const { return _padded_shape_size; }

    inline size_t getMetaMemSize() const { return _meta.size() * sizeof(size_t); }
    inline const int8_t *getMetaStart() const { return reinterpret_cast<const int8_t *>(_meta.data()); }
    inline const size_t *getInputDims() const { return _meta.data(); }
    inline const size_t *getKernelDims() const { return getInputDims() + _ndim; }
    inline const size_t *getOutputDims() const { return getKernelDims() + _ndim; }
    inline const size_t *getBiasDims() const { return getOutputDims() + _ndim; }
    inline const size_t *getPadsInfo() const { return getBiasDims() + _bias_dims_size; }
    inline const ptrdiff_t *getStridesInfo() const { return reinterpret_cast<const ptrdiff_t *>(getPadsInfo()) + _ndim; }
    inline const size_t *getDilationsInfo() const { return reinterpret_cast<const size_t *>(getStridesInfo()) + _ndim; }
    inline const size_t *getPaddedShape() const { return getDilationsInfo() + _ndim; }

    inline size_t input_dim(size_t i) const { return i < _ndim ? getInputDims()[i] : 0; }
    inline size_t kernel_dim(size_t i) const { return i < _ndim ? getKernelDims()[i] : 0; }
    inline size_t output_dim(size_t i) const { return i < _ndim ? getOutputDims()[i] : 0; }
    inline size_t bias_dim(size_t i) const { return i < _bias_dims_size ? getBiasDims()[i] : 0; }
    inline size_t pad_info(size_t i) const { return i < _ndim ? getPadsInfo()[i] : 0; }
    inline ptrdiff_t stride_info(size_t i) const { return i < _ndim ? getStridesInfo()[i] : 0; }
    inline size_t dilation_info(size_t i) const { return i < _ndim ? getDilationsInfo()[i] : 0; }
    inline size_t padded_shape_dim(size_t i) const { return i < _padded_shape_size ? getPaddedShape()[i] : 0; }

    static utils::Result<ConvInfo> create(
        infiniopHandle_t handle_,
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc,
        infiniopTensorDescriptor_t w_desc,
        infiniopTensorDescriptor_t b_desc,
        const void *pads,
        const void *strides,
        const void *dilations,
        size_t n);
};

inline utils::Result<size_t> calculateConvOutputSize(
    size_t input_size, size_t kernel_size, size_t padding, size_t stride, size_t dilation) {
    if (stride == 0) {
        return utils::Result<size_t>(INFINI_STATUS_BAD_TENSOR_SHAPE);
    }
    if (dilation == 0) {
        return utils::Result<size_t>(INFINI_STATUS_BAD_TENSOR_SHAPE);
    }
    if (kernel_size == 0) {
        return utils::Result<size_t>(INFINI_STATUS_BAD_TENSOR_SHAPE);
    }
    size_t effective_kernel = dilation * (kernel_size - 1) + 1;
    size_t padded_input = input_size + 2 * padding;
    if (padded_input < effective_kernel) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }
    size_t output_size = (padded_input - effective_kernel) / stride + 1;
    return utils::Result<size_t>(output_size);
}

inline utils::Result<ConvInfo> ConvInfo::create(
    infiniopHandle_t handle_,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    infiniopTensorDescriptor_t b_desc,
    const void *pads,
    const void *strides,
    const void *dilations,
    size_t n) {
    auto dtype = y_desc->dtype();
    if (dtype != x_desc->dtype() || dtype != w_desc->dtype()) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    size_t ndim = n;
    size_t new_dims = n + 2;
    if (x_desc->ndim() < new_dims || y_desc->ndim() < new_dims || w_desc->ndim() < new_dims) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }
    size_t batch = x_desc->shape()[0];
    size_t in_channels = x_desc->shape()[1];
    size_t out_channels = w_desc->shape()[0];
    if (y_desc->shape()[0] != batch || y_desc->shape()[1] != out_channels || w_desc->shape()[1] != in_channels) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }
    size_t bias_dims_size = (b_desc != nullptr) ? x_desc->ndim() : 0;
    const size_t *pads_ptr = reinterpret_cast<const size_t *>(pads);
    bool has_padding = false;
    if (pads_ptr != nullptr) {
        for (size_t i = 0; i < ndim; ++i) {
            if (pads_ptr[i] > 0) {
                has_padding = true;
                break;
            }
        }
    }
    size_t padded_shape_size = has_padding ? (ndim + 2) : 0;
    // compute the total size of the meta buffer
    size_t meta_size = ndim * 6 + bias_dims_size + padded_shape_size;
    std::vector<size_t> meta(meta_size);
    size_t *input_dims = meta.data();
    size_t *kernel_dims = input_dims + ndim;
    size_t *output_dims = kernel_dims + ndim;
    size_t *bias_dims = output_dims + ndim;
    size_t *pads_info = bias_dims + bias_dims_size;
    ptrdiff_t *strides_info = reinterpret_cast<ptrdiff_t *>(pads_info) + ndim;
    size_t *dilations_info = reinterpret_cast<size_t *>(strides_info) + ndim;
    size_t *padded_shape = dilations_info + ndim;
    const ptrdiff_t *strides_ptr = reinterpret_cast<const ptrdiff_t *>(strides);
    const size_t *dilations_ptr = reinterpret_cast<const size_t *>(dilations);
    size_t spatial_sizes = 1;
    for (size_t i = 0; i < ndim; i++) {
        input_dims[i] = x_desc->shape()[i + 2];
        kernel_dims[i] = w_desc->shape()[i + 2];
        output_dims[i] = y_desc->shape()[i + 2];
        pads_info[i] = pads_ptr == nullptr ? 0 : pads_ptr[i];
        strides_info[i] = strides_ptr == nullptr ? 1 : strides_ptr[i];
        dilations_info[i] = dilations_ptr == nullptr ? 1 : dilations_ptr[i];
        spatial_sizes = spatial_sizes * output_dims[i];
        auto output_result = calculateConvOutputSize(
            input_dims[i], kernel_dims[i], pads_info[i], strides_info[i], dilations_info[i]);
        CHECK_RESULT(output_result);
        size_t expected_output = output_result.take();
        if (output_dims[i] != expected_output) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
    }
    if (bias_dims_size > 0) {
        std::fill(bias_dims, bias_dims + bias_dims_size, 1);
        bias_dims[1] = b_desc->shape()[0];
    }
    if (padded_shape_size > 0) {
        padded_shape[0] = batch;
        padded_shape[1] = in_channels;
        for (size_t i = 0; i < ndim; ++i) {
            padded_shape[i + 2] = input_dims[i] + 2 * pads_info[i];
        }
    }
    ConvInfo info(std::move(meta), ndim, batch, in_channels, out_channels,
                  spatial_sizes, bias_dims_size, padded_shape_size);
    return utils::Result<ConvInfo>(info);
}

} // namespace op::conv

#endif // __CONV_INFO_H__
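
`calculateConvOutputSize` implements the standard convolution shape formula. In LaTeX, with input size $i$, kernel size $k$, padding $p$, stride $s$, and dilation $d$:

\[
\text{out} = \left\lfloor \frac{i + 2p - d\,(k - 1) - 1}{s} \right\rfloor + 1
\]

As a worked check against one of the new test cases (input 32, kernel 5, pad 2, stride 2, dilation 1): $\text{out} = \lfloor (32 + 4 - 4 - 1)/2 \rfloor + 1 = \lfloor 31/2 \rfloor + 1 = 16$, which matches the 16x16 output the test expects.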
src/infiniop/ops/conv/operator.cc (new file, 0 → 100644)
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/conv.h"
#ifdef ENABLE_CPU_API
#include "cpu/conv_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/conv_cuda.cuh"
#endif
__C __export infiniStatus_t infiniopCreateConvDescriptor(
    infiniopHandle_t handle,
    infiniopConvDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    infiniopTensorDescriptor_t b_desc,
    void *pads,
    void *strides,
    void *dilations,
    size_t n) {

#define CREATE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        return op::conv::NAMESPACE::Descriptor::create(                     \
            handle,                                                         \
            reinterpret_cast<op::conv::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                         \
            x_desc,                                                         \
            w_desc,                                                         \
            b_desc,                                                         \
            pads,                                                           \
            strides,                                                        \
            dilations,                                                      \
            n)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

__C infiniStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, size_t *size) {

#define GET(CASE, NAMESPACE)                                                                      \
    case CASE:                                                                                    \
        *size = reinterpret_cast<const op::conv::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        GET(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}

__C infiniStatus_t infiniopConv(
    infiniopConvDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *w,
    const void *bias,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                              \
    case CASE:                                                                  \
        return reinterpret_cast<const op::conv::NAMESPACE::Descriptor *>(desc)  \
            ->calculate(workspace, workspace_size,                              \
                        y,                                                      \
                        x,                                                      \
                        w,                                                      \
                        bias,                                                   \
                        stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

__C infiniStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                                  \
    case CASE:                                                                   \
        delete reinterpret_cast<const op::conv::NAMESPACE::Descriptor *>(desc);  \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
test/infiniop/conv.py
-from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
 import torch
 import ctypes
-import sys
-import os
-import time
 from ctypes import c_uint64
-
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
-from operatorspy import (
-    open_lib,
-    to_tensor,
-    DeviceEnum,
-    infiniopHandle_t,
-    infiniopTensorDescriptor_t,
-    create_handle,
-    destroy_handle,
-)
-from operatorspy.tests.test_utils import get_args
-import torch
-from enum import Enum, auto
-from typing import List, Tuple
-import math
-import ctypes
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
 from torch.nn import functional as F
 from typing import List, Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
...
...
@@ -29,36 +29,91 @@ from typing import List, Tuple
-PROFILE = False
-NUM_PRERUN = 10
-NUM_ITERATIONS = 1000
+_TEST_CASES = [
+    # x_shape, x_stride, w_shape, w_stride, pads, strides, dilations, x_strides
+    ((32, 3, 4), (12, 4, 1), (32, 3, 5), (15, 5, 1), (1,), (1,), (1,)),
+    ((1, 3, 4, 4), (48, 16, 4, 1), (2, 3, 3, 3), (27, 9, 3, 1), (1, 1), (1, 2), (2, 1)),
+    ((32, 3, 32, 32), (32 * 32 * 3, 32 * 32, 32, 1), (64, 3, 5, 5), (75, 25, 5, 1), (2, 2), (2, 2), (1, 1)),
+    ((1, 1, 4, 4, 4), (64, 64, 16, 4, 1), (1, 1, 5, 5, 5), (125, 125, 25, 5, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1)),
+    ((32, 3, 32, 32, 32), (32 * 32 * 32 * 3, 32 * 32 * 32, 32 * 32, 32, 1), (64, 3, 5, 5, 5), (375, 125, 25, 5, 1), (3, 2, 2), (4, 3, 3), (2, 2, 1)),
+]

-class ConvDescriptor(Structure):
-    _fields_ = [("device", c_int32)]
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]

-infiniopConvDescriptor_t = POINTER(ConvDescriptor)
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
+    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
+}

+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000

-def conv(x, w, stride, padding, dilation):
+def conv(x, w, stride, padding, dilation, y_tensor, bias=None):
     match len(x.shape) - 2:
         case 1:
-            return F.conv1d(x, w, stride=stride, padding=padding, dilation=dilation)
+            y_tensor.copy_(
+                F.conv1d(x, w, bias=bias, stride=stride, padding=padding, dilation=dilation)
+            )
         case 2:
-            return F.conv2d(x, w, stride=stride, padding=padding, dilation=dilation)
+            y_tensor.copy_(
+                F.conv2d(x, w, bias=bias, stride=stride, padding=padding, dilation=dilation)
+            )
         case 3:
-            return F.conv3d(x, w, stride=stride, padding=padding, dilation=dilation)
+            y_tensor.copy_(
+                F.conv3d(x, w, bias=bias, stride=stride, padding=padding, dilation=dilation)
+            )
         case _:
             print("Error: Pytorch -> Unsupported tensor dimension")
             return None

 # infer the shape of the output given the inputs for a N-ary convolution
-def inferShape(
+def inferShapeStride(
     x_shape: List[int],
     w_shape: List[int],
     pads: List[int],
     strides: List[int],
     dilations: List[int],
-) -> Tuple[int, ...]:
+) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:
     assert (
         len(x_shape) == len(w_shape)
...
...
@@ -74,7 +129,12 @@ def inferShape(
         )
         for i in range(len(pads))
     ]
-    return (x_shape[0], w_shape[0]) + tuple(output_dims)
+    output_shape = (x_shape[0], w_shape[0]) + tuple(output_dims)
+    output_strides = [1]
+    for s in reversed(output_shape[1:]):
+        output_strides.insert(0, output_strides[0] * s)
+    output_strides = tuple(output_strides)
+    return output_shape, output_strides


 # convert a python tuple to a ctype void pointer
...
...
@@ -85,52 +145,43 @@ def tuple_to_void_p(py_tuple: Tuple):
 def test(
-    lib,
     handle,
-    torch_device,
+    device,
     x_shape,
     x_stride,
     w_shape,
     w_stride,
     pads,
     strides,
     dilations,
-    tensor_stride=None,
-    tensor_dtype=torch.float16,
-    sync=None
+    tensor_dtype=InfiniDtype.F16,
+    sync=None,
 ):
     assert len(pads) == len(strides) == len(dilations)
+    x = TestTensor(x_shape, x_stride, dt=tensor_dtype, device=device, scale=0.01)
+    w = TestTensor(w_shape, w_stride, dt=tensor_dtype, device=device, scale=0.01)
+    y_shape, y_stride = inferShapeStride(x_shape, w_shape, pads, strides, dilations)
+    y = TestTensor(y_shape, y_stride, dt=tensor_dtype, device=device)
+    b = (
+        TestTensor((w.shape[0],), (1,), dt=tensor_dtype, device=device, scale=0.01)
+        if w.shape[0] > 1
+        else None
+    )
     print(
-        f"Testing Conv on {torch_device} with x_shape:{x_shape}, w_shape:{w_shape}, b_shape:{w_shape[0]}, pads:{pads}, strides:{strides}, dilations:{dilations}, x_stride:{tensor_stride} dtype:{tensor_dtype}"
+        f"Testing Conv on {InfiniDeviceNames[device]} with x_shape:{x_shape}, w_shape:{w_shape}, b_shape:{w_shape[0]}, pads:{pads}, strides:{strides}, dilations:{dilations}, x_stride:{x_stride} "
+        f"dtype:{InfiniDtypeNames[tensor_dtype]}"
     )
-    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
-    w = torch.rand(w_shape, dtype=tensor_dtype).to(torch_device)
-    y = torch.zeros(
-        inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype
-    ).to(torch_device)
-    for i in range(NUM_PRERUN if PROFILE else 1):
-        ans = conv(x, w, strides, pads, dilations)
-    if PROFILE:
-        start_time = time.time()
-        for i in range(NUM_ITERATIONS):
-            _ = conv(x, w, strides, pads, dilations)
-        elapsed = (time.time() - start_time) / NUM_ITERATIONS
-        print(f"pytorch time: {elapsed :6f}")
-    x_tensor = to_tensor(x, lib)
-    w_tensor = to_tensor(w, lib)
-    y_tensor = to_tensor(y, lib)
+    conv(
+        x.torch_tensor(),
+        w.torch_tensor(),
+        strides,
+        pads,
+        dilations,
+        y.torch_tensor(),
+        b.torch_tensor() if b is not None else None,
+    )
     if sync is not None:
         sync()
-    descriptor = infiniopConvDescriptor_t()
+    descriptor = infiniopOperatorDescriptor_t()
     check_error(
-        lib.infiniopCreateConvDescriptor(
+        LIBINFINIOP.infiniopCreateConvDescriptor(
             handle,
             ctypes.byref(descriptor),
-            y_tensor.descriptor,
-            x_tensor.descriptor,
-            w_tensor.descriptor,
+            y.descriptor,
+            x.descriptor,
+            w.descriptor,
+            b.descriptor if b is not None else None,
             tuple_to_void_p(pads),
             tuple_to_void_p(strides),
             tuple_to_void_p(dilations),
...
...
@@ -139,169 +190,55 @@ def test(
         )
     )
     # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
-    x_tensor.descriptor.contents.invalidate()
-    w_tensor.descriptor.contents.invalidate()
-    y_tensor.descriptor.contents.invalidate()
+    for tensor in [x, y, w, b]:
+        if tensor is not None:
+            tensor.destroy_desc()

-    workspaceSize = ctypes.c_uint64(0)
+    workspace_size = ctypes.c_uint64(0)
     check_error(
-        lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
+        LIBINFINIOP.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspace_size))
     )
-    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
-    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
+    workspace = TestWorkspace(workspace_size.value, y.device)

-    for i in range(NUM_PRERUN if PROFILE else 1):
+    def lib_conv():
         check_error(
-            lib.infiniopConv(
+            LIBINFINIOP.infiniopConv(
                 descriptor,
-                workspace_ptr,
-                workspaceSize,
-                y_tensor.data,
-                x_tensor.data,
-                w_tensor.data,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                w.data(),
+                b.data() if b is not None else None,
                 None,
             )
         )
-    if PROFILE:
-        start_time = time.time()
-        for i in range(NUM_ITERATIONS):
-            check_error(
-                lib.infiniopConv(
-                    descriptor,
-                    workspace_ptr,
-                    workspaceSize,
-                    y_tensor.data,
-                    x_tensor.data,
-                    w_tensor.data,
-                    None,
-                )
-            )
-        elapsed = (time.time() - start_time) / NUM_ITERATIONS
-        print(f"    lib time: {elapsed :6f}")
-    if tensor_dtype == torch.float16:
-        assert torch.allclose(y, ans, atol=0, rtol=1e-2)
-    else:
-        assert torch.allclose(y, ans, atol=0, rtol=1e-3)
-    check_error(lib.infiniopDestroyConvDescriptor(descriptor))

+    lib_conv()
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)

-def test_cpu(lib, test_cases):
-    device = DeviceEnum.DEVICE_CPU
-    handle = create_handle(lib, device)
-    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
-        # fmt: off
-        test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
-        test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
-        # fmt: on
-    destroy_handle(lib, handle)
-
-
-def test_cuda(lib, test_cases):
-    device = DeviceEnum.DEVICE_CUDA
-    handle = create_handle(lib, device)
-    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
-        # fmt: off
-        test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
-        test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
-        # fmt: on
-    destroy_handle(lib, handle)
-
-
-def test_bang(lib, test_cases):
-    import torch_mlu
-    device = DeviceEnum.DEVICE_BANG
-    handle = create_handle(lib, device)
-    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
-        # fmt: off
-        test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
-        test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
-        # fmt: on
-    destroy_handle(lib, handle)
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: conv(x.torch_tensor(), w.torch_tensor(), strides, pads, dilations, b.torch_tensor() if b is not None else None), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_conv(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyConvDescriptor(descriptor))


 if __name__ == "__main__":
-    test_cases = [
-        # x_shape, w_shape, pads, strides, dilations, x_strides
-        ((32, 3, 4), (32, 3, 5), (1,), (1,), (1,), None),
-        ((1, 3, 4, 4), (2, 3, 3, 3), (1, 1), (1, 2), (2, 1), None),
-        ((32, 3, 128, 128), (64, 3, 5, 5), (2, 2), (2, 2), (1, 1), None),
-        ((1, 1, 4, 4, 4), (1, 1, 5, 5, 5), (1, 1, 1), (1, 1, 1), (1, 1, 1), None),
-        ((32, 3, 32, 32, 32), (64, 3, 5, 5, 5), (3, 2, 2), (4, 3, 3), (2, 2, 1), None),
-    ]
     args = get_args()
-    lib = open_lib()
-    lib.infiniopCreateConvDescriptor.restype = c_int32
-    lib.infiniopCreateConvDescriptor.argtypes = [
-        infiniopHandle_t,
-        POINTER(infiniopConvDescriptor_t),
-        infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-        c_uint64,
-    ]
-    lib.infiniopConv.restype = c_int32
-    lib.infiniopConv.argtypes = [
-        infiniopConvDescriptor_t,
-        c_void_p,
-        c_uint64,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-        c_void_p,
-    ]
-    lib.infiniopDestroyConvDescriptor.restype = c_int32
-    lib.infiniopDestroyConvDescriptor.argtypes = [
-        infiniopConvDescriptor_t,
-    ]
-    if args.cpu:
-        test_cpu(lib, test_cases)
-    if args.cuda:
-        test_cuda(lib, test_cases)
-    if args.bang:
-        test_bang(lib, test_cases)
-    if not (args.cpu or args.cuda or args.bang):
-        test_cpu(lib, test_cases)
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+    print("\033[92mTest passed!\033[0m")
test/infiniop/libinfiniop/op_register.py
...
...
@@ -453,3 +453,39 @@ def swiglu_(lib):
     lib.infiniopDestroySwiGLUDescriptor.argtypes = [
         infiniopOperatorDescriptor_t,
     ]
+
+
+@OpRegister.operator
+def conv_(lib):
+    lib.infiniopCreateConvDescriptor.restype = c_int32
+    lib.infiniopCreateConvDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_size_t,
+    ]
+
+    lib.infiniopGetConvWorkspaceSize.restype = c_int32
+    lib.infiniopGetConvWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopConv.restype = c_int32
+    lib.infiniopConv.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyConvDescriptor.restype = c_int32
+    lib.infiniopDestroyConvDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]