jerrrrry / infinicore · Commits

Unverified commit 0166515c, authored Aug 07, 2025 by PanZezhong1725, committed by GitHub on Aug 07, 2025

Merge branch 'main' into issue/300

Parents: f0300ff3, a23c4d13

Changes: 175 in total; this page shows 20 changed files with 1423 additions and 81 deletions (+1423 -81).
src/infiniop/ops/conv/cpu/conv_cpu.cc                     +363  -0
src/infiniop/ops/conv/cpu/conv_cpu.h                      +8    -0
src/infiniop/ops/conv/info.h                              +257  -0
src/infiniop/ops/conv/nvidia/conv_nvidia.cu               +456  -0
src/infiniop/ops/conv/nvidia/conv_nvidia.cuh              +8    -0
src/infiniop/ops/conv/operator.cc                         +138  -0
src/infiniop/ops/gemm/maca/gemm_maca.h                    +0    -8
src/infiniop/ops/gemm/metax/gemm_metax.cc                 +12   -11
src/infiniop/ops/gemm/metax/gemm_metax.h                  +8    -0
src/infiniop/ops/gemm/nvidia/gemm_nvidia.cu               +10   -10
src/infiniop/ops/gemm/nvidia/gemm_nvidia.cuh              +1    -1
src/infiniop/ops/gemm/operator.cc                         +27   -15
src/infiniop/ops/mul/cpu/mul_cpu.cc                       +3    -1
src/infiniop/ops/mul/cuda/kernel.cuh                      +2    -5
src/infiniop/ops/mul/metax/mul_metax.h                    +8    -0
src/infiniop/ops/mul/metax/mul_metax.maca                 +61   -0
src/infiniop/ops/mul/nvidia/mul_nvidia.cu                 +13   -9
src/infiniop/ops/mul/nvidia/mul_nvidia.cuh                +8    -0
src/infiniop/ops/mul/operator.cc                          +40   -13
src/infiniop/ops/random_sample/maca/random_sample_maca.h  +0    -8
src/infiniop/ops/conv/cpu/conv_cpu.cc (new file, mode 100644)
#include "conv_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include <algorithm>
namespace
op
::
conv
::
cpu
{
inline
size_t
calculatePaddedInputSize
(
const
ConvInfo
&
info
)
{
std
::
vector
<
size_t
>
shape
(
info
.
ndim
()
+
2
);
shape
[
0
]
=
info
.
batch
();
shape
[
1
]
=
info
.
in_channels
();
for
(
size_t
i
=
0
;
i
<
info
.
ndim
();
++
i
)
{
shape
[
i
+
2
]
=
info
.
input_dim
(
i
);
}
return
op
::
common_cpu
::
getPaddedSize
(
info
.
ndim
()
+
2
,
shape
.
data
(),
info
.
getPadsInfo
());
}
inline
size_t
calculateOutputSize
(
const
ConvInfo
&
info
)
{
size_t
size
=
info
.
batch
()
*
info
.
out_channels
();
for
(
size_t
i
=
0
;
i
<
info
.
ndim
();
++
i
)
{
size
*=
info
.
output_dim
(
i
);
}
return
size
;
}
inline
bool
needsPadding
(
const
ConvInfo
&
info
)
{
const
size_t
*
pads
=
info
.
getPadsInfo
();
for
(
size_t
i
=
0
;
i
<
info
.
ndim
();
++
i
)
{
if
(
pads
[
i
]
>
0
)
{
return
true
;
}
}
return
false
;
}
Descriptor
::~
Descriptor
()
=
default
;
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle_
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
y_desc
,
infiniopTensorDescriptor_t
x_desc
,
infiniopTensorDescriptor_t
w_desc
,
infiniopTensorDescriptor_t
b_desc
,
const
void
*
pads
,
const
void
*
strides
,
const
void
*
dilations
,
size_t
n
)
{
auto
handle
=
reinterpret_cast
<
device
::
cpu
::
Handle
*>
(
handle_
);
auto
dtype
=
y_desc
->
dtype
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_BF16
);
auto
result
=
ConvInfo
::
create
(
handle_
,
y_desc
,
x_desc
,
w_desc
,
b_desc
,
pads
,
strides
,
dilations
,
n
);
CHECK_RESULT
(
result
);
size_t
WorkSpaceSize
=
0
;
const
ConvInfo
&
info
=
result
.
take
();
if
(
needsPadding
(
info
))
{
WorkSpaceSize
+=
calculatePaddedInputSize
(
info
)
*
infiniSizeOf
(
dtype
);
}
if
(
dtype
==
INFINI_DTYPE_F16
||
dtype
==
INFINI_DTYPE_BF16
)
{
WorkSpaceSize
+=
calculateOutputSize
(
info
)
*
sizeof
(
float
);
}
*
desc_ptr
=
new
Descriptor
(
dtype
,
std
::
move
(
info
),
WorkSpaceSize
,
nullptr
,
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
}
template
<
typename
Tdata
>
void
fillPaddedInput
(
const
ConvInfo
&
info
,
const
Tdata
*
x
,
const
size_t
*
padded_x_shape
,
Tdata
*
padded_x
,
size_t
x_index
,
size_t
padded_x_index
,
size_t
ndim
)
{
size_t
x_shape_val
;
if
(
ndim
==
0
)
{
x_shape_val
=
info
.
batch
();
}
else
if
(
ndim
==
1
)
{
x_shape_val
=
info
.
in_channels
();
}
else
{
x_shape_val
=
info
.
input_dim
(
ndim
-
2
);
}
const
auto
padded_x_shape_val
=
padded_x_shape
[
ndim
];
const
auto
x_base_index
=
x_index
*
x_shape_val
;
size_t
pad_offset
=
0
;
if
(
ndim
>=
2
&&
x_shape_val
!=
padded_x_shape_val
)
{
pad_offset
=
info
.
pad_info
(
ndim
-
2
);
}
const
auto
padded_x_base_index
=
padded_x_index
*
padded_x_shape_val
+
pad_offset
;
for
(
size_t
i
=
0
;
i
<
x_shape_val
;
++
i
)
{
if
(
ndim
==
info
.
ndim
()
+
2
-
1
)
{
padded_x
[
padded_x_base_index
+
i
]
=
x
[
x_base_index
+
i
];
}
else
{
fillPaddedInput
(
info
,
x
,
padded_x_shape
,
padded_x
,
x_base_index
+
i
,
padded_x_base_index
+
i
,
ndim
+
1
);
}
}
}
template
<
typename
Xdata
,
typename
Ydata
>
void
_applyConv
(
const
ConvInfo
&
info
,
Ydata
*
y
,
const
Xdata
*
x
,
const
Xdata
*
w
,
const
size_t
*
x_shape
,
size_t
x_index
,
size_t
w_index
,
size_t
y_index
,
size_t
ndim
)
{
size_t
dim_size
,
kernel_size
;
size_t
dilation
,
stride
;
if
(
ndim
<
2
)
{
return
;
}
else
{
dim_size
=
x_shape
[
ndim
];
kernel_size
=
info
.
kernel_dim
(
ndim
-
2
);
dilation
=
info
.
dilation_info
(
ndim
-
2
);
stride
=
info
.
stride_info
(
ndim
-
2
);
}
if
(
stride
==
0
)
{
std
::
cerr
<<
"Error: stride cannot be zero."
<<
std
::
endl
;
}
const
auto
steps
=
(
dim_size
-
dilation
*
(
kernel_size
-
1
)
-
1
)
/
stride
+
1
;
x_index
*=
dim_size
;
w_index
*=
kernel_size
;
size_t
y_stride
;
if
(
ndim
==
0
)
{
y_stride
=
info
.
out_channels
();
}
else
if
(
ndim
==
1
)
{
y_stride
=
1
;
}
else
{
y_stride
=
info
.
output_dim
(
ndim
-
2
);
}
y_index
*=
y_stride
;
for
(
size_t
i
=
0
;
i
<
steps
;
++
i
,
++
y_index
)
{
for
(
size_t
k
=
0
;
k
<
kernel_size
;
++
k
)
{
const
auto
curr_x_index
=
x_index
+
i
*
stride
+
k
*
dilation
;
const
auto
curr_w_index
=
w_index
+
k
;
if
(
ndim
==
info
.
ndim
()
+
1
)
{
if
constexpr
(
std
::
is_same
<
Xdata
,
fp16_t
>::
value
||
std
::
is_same
<
Xdata
,
bf16_t
>::
value
)
{
y
[
y_index
]
+=
utils
::
cast
<
float
>
(
x
[
curr_x_index
])
*
utils
::
cast
<
float
>
(
w
[
curr_w_index
]);
}
else
{
y
[
y_index
]
+=
x
[
curr_x_index
]
*
w
[
curr_w_index
];
}
}
else
{
_applyConv
(
info
,
y
,
x
,
w
,
x_shape
,
curr_x_index
,
curr_w_index
,
y_index
,
ndim
+
1
);
}
}
}
}
template
<
typename
Xdata
,
typename
Ydata
>
void
applyConv
(
const
ConvInfo
&
info
,
Ydata
*
y
,
const
Xdata
*
x
,
const
Xdata
*
w
,
const
size_t
*
x_shape
)
{
const
ptrdiff_t
batch_size
=
static_cast
<
ptrdiff_t
>
(
info
.
batch
());
const
ptrdiff_t
out_channels
=
static_cast
<
ptrdiff_t
>
(
info
.
out_channels
());
const
ptrdiff_t
total_iterations
=
batch_size
*
out_channels
;
#pragma omp parallel for schedule(dynamic)
for
(
ptrdiff_t
iter
=
0
;
iter
<
total_iterations
;
++
iter
)
{
const
ptrdiff_t
i
=
iter
/
out_channels
;
// batch index
const
ptrdiff_t
j
=
iter
%
out_channels
;
// output channel index
const
size_t
y_index
=
static_cast
<
size_t
>
(
i
)
*
info
.
out_channels
()
+
static_cast
<
size_t
>
(
j
);
// 内层循环:遍历输入通道
for
(
size_t
k
=
0
;
k
<
info
.
in_channels
();
++
k
)
{
const
size_t
x_index
=
static_cast
<
size_t
>
(
i
)
*
info
.
in_channels
()
+
k
;
const
size_t
w_index
=
static_cast
<
size_t
>
(
j
)
*
info
.
in_channels
()
+
k
;
_applyConv
(
info
,
y
,
x
,
w
,
x_shape
,
x_index
,
w_index
,
y_index
,
2
);
}
}
}
template
<
typename
Xdata
,
typename
Ydata
>
void
_conv_cpu
(
const
ConvInfo
&
info
,
void
*
workspace
,
size_t
workspace_size
,
Ydata
*
y
,
const
Xdata
*
x
,
const
Xdata
*
w
)
{
if
(
needsPadding
(
info
))
{
auto
padded_x
=
reinterpret_cast
<
Xdata
*>
(
workspace
);
if
constexpr
(
std
::
is_same
<
Xdata
,
fp16_t
>::
value
)
{
fp16_t
zero_val
=
utils
::
cast
<
fp16_t
>
(
0.0
f
);
std
::
fill
(
padded_x
,
padded_x
+
calculatePaddedInputSize
(
info
),
zero_val
);
}
else
if
constexpr
(
std
::
is_same
<
Xdata
,
bf16_t
>::
value
)
{
bf16_t
zero_val
=
utils
::
cast
<
bf16_t
>
(
0.0
f
);
std
::
fill
(
padded_x
,
padded_x
+
calculatePaddedInputSize
(
info
),
zero_val
);
}
else
if
constexpr
(
std
::
is_same
<
Xdata
,
float
>::
value
)
{
std
::
fill
(
padded_x
,
padded_x
+
calculatePaddedInputSize
(
info
),
0.0
f
);
}
fillPaddedInput
(
info
,
x
,
info
.
getPaddedShape
(),
padded_x
,
0
,
0
,
0
);
applyConv
(
info
,
y
,
padded_x
,
w
,
info
.
getPaddedShape
());
}
else
{
std
::
vector
<
size_t
>
shape
(
info
.
ndim
()
+
2
);
shape
[
0
]
=
info
.
batch
();
shape
[
1
]
=
info
.
in_channels
();
for
(
size_t
i
=
0
;
i
<
info
.
ndim
();
++
i
)
{
shape
[
i
+
2
]
=
info
.
input_dim
(
i
);
}
applyConv
(
info
,
y
,
x
,
w
,
shape
.
data
());
}
}
template
<
typename
Tdata
>
infiniStatus_t
conv_cpu
(
const
ConvInfo
&
info
,
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
const
void
*
w
,
const
void
*
bias
)
{
auto
y_ptr
=
reinterpret_cast
<
Tdata
*>
(
y
);
auto
x_ptr
=
reinterpret_cast
<
const
Tdata
*>
(
x
);
auto
w_ptr
=
reinterpret_cast
<
const
Tdata
*>
(
w
);
auto
output_size
=
calculateOutputSize
(
info
);
if
constexpr
(
std
::
is_same
<
Tdata
,
float
>::
value
)
{
std
::
fill
(
y_ptr
,
y_ptr
+
output_size
,
0.0
f
);
}
else
{
std
::
fill
(
y_ptr
,
y_ptr
+
output_size
,
static_cast
<
Tdata
>
(
0
));
}
_conv_cpu
<
Tdata
,
Tdata
>
(
info
,
workspace
,
workspace_size
,
y_ptr
,
x_ptr
,
w_ptr
);
if
(
bias
!=
nullptr
)
{
auto
bias_ptr
=
reinterpret_cast
<
const
Tdata
*>
(
bias
);
#pragma omp parallel for
for
(
ptrdiff_t
i
=
0
;
i
<
static_cast
<
ptrdiff_t
>
(
output_size
);
++
i
)
{
size_t
channel_idx
=
(
i
/
info
.
spatial_sizes
())
%
info
.
out_channels
();
y_ptr
[
i
]
+=
bias_ptr
[
channel_idx
];
}
}
return
INFINI_STATUS_SUCCESS
;
}
template
<
>
infiniStatus_t
conv_cpu
<
fp16_t
>
(
const
ConvInfo
&
info
,
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
const
void
*
w
,
const
void
*
bias
)
{
auto
y_float
=
reinterpret_cast
<
float
*>
(
workspace
);
auto
x_half
=
reinterpret_cast
<
const
fp16_t
*>
(
x
);
auto
w_half
=
reinterpret_cast
<
const
fp16_t
*>
(
w
);
auto
output_size
=
calculateOutputSize
(
info
);
std
::
fill
(
y_float
,
y_float
+
output_size
,
0.0
f
);
void
*
conv_workspace
=
y_float
+
output_size
;
size_t
conv_workspace_size
=
workspace_size
-
output_size
*
sizeof
(
float
);
_conv_cpu
<
fp16_t
,
float
>
(
info
,
conv_workspace
,
conv_workspace_size
,
y_float
,
x_half
,
w_half
);
auto
y_half
=
reinterpret_cast
<
fp16_t
*>
(
y
);
if
(
bias
!=
nullptr
)
{
auto
bias_half
=
reinterpret_cast
<
const
fp16_t
*>
(
bias
);
#pragma omp parallel for
for
(
ptrdiff_t
i
=
0
;
i
<
static_cast
<
ptrdiff_t
>
(
output_size
);
++
i
)
{
size_t
channel_idx
=
(
i
/
info
.
spatial_sizes
())
%
info
.
out_channels
();
float
bias_value
=
utils
::
cast
<
float
>
(
bias_half
[
channel_idx
]);
y_float
[
i
]
+=
bias_value
;
y_half
[
i
]
=
utils
::
cast
<
fp16_t
>
(
y_float
[
i
]);
}
}
else
{
#pragma omp parallel for
for
(
ptrdiff_t
i
=
0
;
i
<
static_cast
<
ptrdiff_t
>
(
output_size
);
++
i
)
{
y_half
[
i
]
=
utils
::
cast
<
fp16_t
>
(
y_float
[
i
]);
}
}
return
INFINI_STATUS_SUCCESS
;
}
template
<
>
infiniStatus_t
conv_cpu
<
bf16_t
>
(
const
ConvInfo
&
info
,
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
const
void
*
w
,
const
void
*
bias
)
{
auto
y_float
=
reinterpret_cast
<
float
*>
(
workspace
);
auto
x_half
=
reinterpret_cast
<
const
bf16_t
*>
(
x
);
auto
w_half
=
reinterpret_cast
<
const
bf16_t
*>
(
w
);
auto
output_size
=
calculateOutputSize
(
info
);
std
::
fill
(
y_float
,
y_float
+
output_size
,
0.0
f
);
void
*
conv_workspace
=
y_float
+
output_size
;
size_t
conv_workspace_size
=
workspace_size
-
output_size
*
sizeof
(
float
);
_conv_cpu
<
bf16_t
,
float
>
(
info
,
conv_workspace
,
conv_workspace_size
,
y_float
,
x_half
,
w_half
);
auto
y_half
=
reinterpret_cast
<
bf16_t
*>
(
y
);
if
(
bias
!=
nullptr
)
{
auto
bias_half
=
reinterpret_cast
<
const
bf16_t
*>
(
bias
);
#pragma omp parallel for
for
(
ptrdiff_t
i
=
0
;
i
<
static_cast
<
ptrdiff_t
>
(
output_size
);
++
i
)
{
size_t
channel_idx
=
(
i
/
info
.
spatial_sizes
())
%
info
.
out_channels
();
float
bias_value
=
utils
::
cast
<
float
>
(
bias_half
[
channel_idx
]);
y_float
[
i
]
+=
bias_value
;
y_half
[
i
]
=
utils
::
cast
<
bf16_t
>
(
y_float
[
i
]);
}
}
else
{
#pragma omp parallel for
for
(
ptrdiff_t
i
=
0
;
i
<
static_cast
<
ptrdiff_t
>
(
output_size
);
++
i
)
{
y_half
[
i
]
=
utils
::
cast
<
bf16_t
>
(
y_float
[
i
]);
}
}
return
INFINI_STATUS_SUCCESS
;
}
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
const
void
*
w
,
const
void
*
bias
,
void
*
stream
)
const
{
if
(
workspace_size
<
_workspace_size
)
{
return
INFINI_STATUS_INSUFFICIENT_WORKSPACE
;
}
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
return
conv_cpu
<
fp16_t
>
(
_info
,
workspace
,
workspace_size
,
y
,
x
,
w
,
bias
);
case
INFINI_DTYPE_F32
:
return
conv_cpu
<
float
>
(
_info
,
workspace
,
workspace_size
,
y
,
x
,
w
,
bias
);
case
INFINI_DTYPE_BF16
:
return
conv_cpu
<
bf16_t
>
(
_info
,
workspace
,
workspace_size
,
y
,
x
,
w
,
bias
);
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
}
}
// namespace op::conv::cpu
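For reference, a back-of-the-envelope check of the workspace sizing above. This is a standalone sketch, not part of the commit: the shapes (N=1, C_in=3, C_out=8, 32x32 input, 3x3 kernel, pad=1, stride=1) are assumed, and it mirrors Descriptor::create() placing the float accumulation buffer first and the padded fp16 input after it, as conv_cpu<fp16_t>() expects.

#include <cstddef>

// Illustrative workspace arithmetic for the fp16 path (assumed shapes).
int main() {
    const size_t output_size = 1 * 8 * 32 * 32;              // batch * out_channels * output spatial = 8192
    const size_t padded_size = 1 * 3 * (32 + 2) * (32 + 2);  // batch * in_channels * padded spatial = 3468
    // Layout: [ float y_float[output_size] | fp16 padded_x[padded_size] ]
    const size_t workspace_bytes = output_size * sizeof(float) + padded_size * 2 /* sizeof(fp16_t) */;
    return workspace_bytes == 39704 ? 0 : 1;
}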
src/infiniop/ops/conv/cpu/conv_cpu.h (new file, mode 100644)
#ifndef __CONV_CPU_H__
#define __CONV_CPU_H__

#include "../conv.h"

DESCRIPTOR(cpu)

#endif // __CONV_CPU_H__
src/infiniop/ops/conv/info.h (new file, mode 100644)
#ifndef __CONV_INFO_H__
#define __CONV_INFO_H__

#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"

#ifdef ENABLE_CUDA_API
#include "../../devices/nvidia/nvidia_handle.cuh"
#endif

namespace op::conv {
class ConvInfo;
} // namespace op::conv

namespace op::conv {

class ConvInfo {
private:
    // Packed metadata buffer. Layout:
    // input dims | kernel dims | output dims | bias dims | pads | strides | dilations | padded shape
    std::vector<size_t> _meta;
    size_t _ndim;
    size_t _batch;
    size_t _in_channels;
    size_t _out_channels;
    size_t _spatial_sizes;
    size_t _bias_dims_size;
    size_t _padded_shape_size;

    ConvInfo(std::vector<size_t> meta,
             size_t ndim,
             size_t batch,
             size_t in_channels,
             size_t out_channels,
             size_t spatial_sizes,
             size_t bias_dims_size,
             size_t padded_shape_size)
        : _meta(std::move(meta)),
          _ndim(ndim),
          _batch(batch),
          _in_channels(in_channels),
          _out_channels(out_channels),
          _spatial_sizes(spatial_sizes),
          _bias_dims_size(bias_dims_size),
          _padded_shape_size(padded_shape_size) {}

public:
    inline size_t ndim() const { return _ndim; }
    inline size_t batch() const { return _batch; }
    inline size_t in_channels() const { return _in_channels; }
    inline size_t out_channels() const { return _out_channels; }
    inline size_t spatial_sizes() const { return _spatial_sizes; }
    inline size_t bias_dims_size() const { return _bias_dims_size; }
    inline size_t padded_shape_size() const { return _padded_shape_size; }

    inline size_t getMetaMemSize() const { return _meta.size() * sizeof(size_t); }
    inline const int8_t *getMetaStart() const { return reinterpret_cast<const int8_t *>(_meta.data()); }

    inline const size_t *getInputDims() const { return _meta.data(); }
    inline const size_t *getKernelDims() const { return getInputDims() + _ndim; }
    inline const size_t *getOutputDims() const { return getKernelDims() + _ndim; }
    inline const size_t *getBiasDims() const { return getOutputDims() + _ndim; }
    inline const size_t *getPadsInfo() const { return getBiasDims() + _bias_dims_size; }
    inline const ptrdiff_t *getStridesInfo() const { return reinterpret_cast<const ptrdiff_t *>(getPadsInfo()) + _ndim; }
    inline const size_t *getDilationsInfo() const { return reinterpret_cast<const size_t *>(getStridesInfo()) + _ndim; }
    inline const size_t *getPaddedShape() const { return getDilationsInfo() + _ndim; }

    inline size_t input_dim(size_t i) const { return i < _ndim ? getInputDims()[i] : 0; }
    inline size_t kernel_dim(size_t i) const { return i < _ndim ? getKernelDims()[i] : 0; }
    inline size_t output_dim(size_t i) const { return i < _ndim ? getOutputDims()[i] : 0; }
    inline size_t bias_dim(size_t i) const { return i < _bias_dims_size ? getBiasDims()[i] : 0; }
    inline size_t pad_info(size_t i) const { return i < _ndim ? getPadsInfo()[i] : 0; }
    inline ptrdiff_t stride_info(size_t i) const { return i < _ndim ? getStridesInfo()[i] : 0; }
    inline size_t dilation_info(size_t i) const { return i < _ndim ? getDilationsInfo()[i] : 0; }
    inline size_t padded_shape_dim(size_t i) const { return i < _padded_shape_size ? getPaddedShape()[i] : 0; }

    static utils::Result<ConvInfo> create(
        infiniopHandle_t handle_,
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc,
        infiniopTensorDescriptor_t w_desc,
        infiniopTensorDescriptor_t b_desc,
        const void *pads,
        const void *strides,
        const void *dilations,
        size_t n);
};

inline utils::Result<size_t> calculateConvOutputSize(
    size_t input_size,
    size_t kernel_size,
    size_t padding,
    size_t stride,
    size_t dilation) {
    if (stride == 0) {
        return utils::Result<size_t>(INFINI_STATUS_BAD_TENSOR_SHAPE);
    }
    if (dilation == 0) {
        return utils::Result<size_t>(INFINI_STATUS_BAD_TENSOR_SHAPE);
    }
    if (kernel_size == 0) {
        return utils::Result<size_t>(INFINI_STATUS_BAD_TENSOR_SHAPE);
    }
    size_t effective_kernel = dilation * (kernel_size - 1) + 1;
    size_t padded_input = input_size + 2 * padding;
    if (padded_input < effective_kernel) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }
    size_t output_size = (padded_input - effective_kernel) / stride + 1;
    return utils::Result<size_t>(output_size);
}

inline utils::Result<ConvInfo> ConvInfo::create(
    infiniopHandle_t handle_,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    infiniopTensorDescriptor_t b_desc,
    const void *pads,
    const void *strides,
    const void *dilations,
    size_t n) {
    auto dtype = y_desc->dtype();
    if (dtype != x_desc->dtype() || dtype != w_desc->dtype()) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    size_t ndim = n;
    size_t new_dims = n + 2;
    if (x_desc->ndim() < new_dims || y_desc->ndim() < new_dims || w_desc->ndim() < new_dims) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }
    size_t batch = x_desc->shape()[0];
    size_t in_channels = x_desc->shape()[1];
    size_t out_channels = w_desc->shape()[0];
    if (y_desc->shape()[0] != batch || y_desc->shape()[1] != out_channels || w_desc->shape()[1] != in_channels) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }
    size_t bias_dims_size = (b_desc != nullptr) ? x_desc->ndim() : 0;
    const size_t *pads_ptr = reinterpret_cast<const size_t *>(pads);
    bool has_padding = false;
    if (pads_ptr != nullptr) {
        for (size_t i = 0; i < ndim; ++i) {
            if (pads_ptr[i] > 0) {
                has_padding = true;
                break;
            }
        }
    }
    size_t padded_shape_size = has_padding ? (ndim + 2) : 0;

    // Total size of the meta buffer.
    size_t meta_size = ndim * 6 + bias_dims_size + padded_shape_size;
    std::vector<size_t> meta(meta_size);
    size_t *input_dims = meta.data();
    size_t *kernel_dims = input_dims + ndim;
    size_t *output_dims = kernel_dims + ndim;
    size_t *bias_dims = output_dims + ndim;
    size_t *pads_info = bias_dims + bias_dims_size;
    ptrdiff_t *strides_info = reinterpret_cast<ptrdiff_t *>(pads_info) + ndim;
    size_t *dilations_info = reinterpret_cast<size_t *>(strides_info) + ndim;
    size_t *padded_shape = dilations_info + ndim;

    const ptrdiff_t *strides_ptr = reinterpret_cast<const ptrdiff_t *>(strides);
    const size_t *dilations_ptr = reinterpret_cast<const size_t *>(dilations);

    size_t spatial_sizes = 1;
    for (size_t i = 0; i < ndim; i++) {
        input_dims[i] = x_desc->shape()[i + 2];
        kernel_dims[i] = w_desc->shape()[i + 2];
        output_dims[i] = y_desc->shape()[i + 2];
        pads_info[i] = pads_ptr == nullptr ? 0 : pads_ptr[i];
        strides_info[i] = strides_ptr == nullptr ? 1 : strides_ptr[i];
        dilations_info[i] = dilations_ptr == nullptr ? 1 : dilations_ptr[i];
        spatial_sizes = spatial_sizes * output_dims[i];
        // Validate that the given output shape matches the computed one.
        auto output_result = calculateConvOutputSize(
            input_dims[i], kernel_dims[i], pads_info[i], strides_info[i], dilations_info[i]);
        CHECK_RESULT(output_result);
        size_t expected_output = output_result.take();
        if (output_dims[i] != expected_output) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
    }
    if (bias_dims_size > 0) {
        std::fill(bias_dims, bias_dims + bias_dims_size, 1);
        bias_dims[1] = b_desc->shape()[0];
    }
    if (padded_shape_size > 0) {
        padded_shape[0] = batch;
        padded_shape[1] = in_channels;
        for (size_t i = 0; i < ndim; ++i) {
            padded_shape[i + 2] = input_dims[i] + 2 * pads_info[i];
        }
    }
    ConvInfo info(std::move(meta), ndim, batch, in_channels, out_channels,
                  spatial_sizes, bias_dims_size, padded_shape_size);
    return utils::Result<ConvInfo>(info);
}

} // namespace op::conv

#endif // __CONV_INFO_H__
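As a quick sanity check of the output-size formula in calculateConvOutputSize above, here is a standalone sketch with assumed example parameters (input 32, kernel 3, pad 1, stride 2, dilation 1 — not values taken from the commit):

#include <cstddef>

// Re-statement of: output = (input + 2*pad - (dilation*(kernel-1) + 1)) / stride + 1
int main() {
    const size_t input = 32, kernel = 3, pad = 1, stride = 2, dilation = 1;
    const size_t effective_kernel = dilation * (kernel - 1) + 1;          // 3
    const size_t padded_input = input + 2 * pad;                          // 34
    const size_t output = (padded_input - effective_kernel) / stride + 1; // (34 - 3) / 2 + 1 = 16
    return output == 16 ? 0 : 1;
}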
src/infiniop/ops/conv/nvidia/conv_nvidia.cu (new file, mode 100644)
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_handle.cuh"
#include "conv_nvidia.cuh"
#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \
do { \
if (desc_ptr) { \
destroy_func(desc_ptr); \
desc_ptr = nullptr; \
} \
} while (0)
#define CLEANUP_CUDNN_DESCRIPTORS() \
do { \
DESTROY_CUDNN_DESCRIPTOR(x_desc, cudnnDestroyTensorDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(y_desc, cudnnDestroyTensorDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(w_desc, cudnnDestroyFilterDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(b_desc, cudnnDestroyTensorDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(act_desc, cudnnDestroyActivationDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \
} while (0)
namespace
op
::
conv
::
nvidia
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
nvidia
::
Handle
::
Internal
>
internal
;
size_t
workspace_size
=
0
;
#ifdef ENABLE_CUDNN_API
cudnnTensorDescriptor_t
x_desc
=
nullptr
;
cudnnTensorDescriptor_t
y_desc
=
nullptr
;
cudnnFilterDescriptor_t
w_desc
=
nullptr
;
cudnnTensorDescriptor_t
b_desc
=
nullptr
;
cudnnActivationDescriptor_t
act_desc
=
nullptr
;
cudnnConvolutionDescriptor_t
conv_desc
=
nullptr
;
cudnnConvolutionFwdAlgo_t
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
#endif
private:
Opaque
(
std
::
shared_ptr
<
device
::
nvidia
::
Handle
::
Internal
>
internal_ptr
)
:
internal
(
internal_ptr
)
{}
#ifdef ENABLE_CUDNN_API
void
initializeDimensionArrays
(
const
ConvInfo
&
info
,
std
::
vector
<
int
>
&
input_dims
,
std
::
vector
<
int
>
&
output_dims
,
std
::
vector
<
int
>
&
filter_dims
,
std
::
vector
<
int
>
&
input_strides
,
std
::
vector
<
int
>
&
output_strides
)
const
{
bool
is_1d_conv
=
(
info
.
ndim
()
==
1
);
int
actual_tensor_ndim
=
is_1d_conv
?
4
:
static_cast
<
int
>
(
info
.
ndim
()
+
2
);
input_dims
[
0
]
=
static_cast
<
int
>
(
info
.
batch
());
input_dims
[
1
]
=
static_cast
<
int
>
(
info
.
in_channels
());
output_dims
[
0
]
=
static_cast
<
int
>
(
info
.
batch
());
output_dims
[
1
]
=
static_cast
<
int
>
(
info
.
out_channels
());
filter_dims
[
0
]
=
static_cast
<
int
>
(
info
.
out_channels
());
filter_dims
[
1
]
=
static_cast
<
int
>
(
info
.
in_channels
());
if
(
is_1d_conv
)
{
input_dims
[
2
]
=
1
;
input_dims
[
3
]
=
static_cast
<
int
>
(
info
.
input_dim
(
0
));
output_dims
[
2
]
=
1
;
output_dims
[
3
]
=
static_cast
<
int
>
(
info
.
output_dim
(
0
));
filter_dims
[
2
]
=
1
;
filter_dims
[
3
]
=
static_cast
<
int
>
(
info
.
kernel_dim
(
0
));
}
else
{
for
(
size_t
i
=
0
;
i
<
info
.
ndim
();
++
i
)
{
input_dims
[
i
+
2
]
=
static_cast
<
int
>
(
info
.
input_dim
(
i
));
output_dims
[
i
+
2
]
=
static_cast
<
int
>
(
info
.
output_dim
(
i
));
filter_dims
[
i
+
2
]
=
static_cast
<
int
>
(
info
.
kernel_dim
(
i
));
}
}
calculateStrides
(
input_dims
,
input_strides
,
actual_tensor_ndim
);
calculateStrides
(
output_dims
,
output_strides
,
actual_tensor_ndim
);
}
void
initializeConvolutionParams
(
const
ConvInfo
&
info
,
std
::
vector
<
int
>
&
pads
,
std
::
vector
<
int
>
&
strides
,
std
::
vector
<
int
>
&
dilations
)
const
{
bool
is_1d_conv
=
(
info
.
ndim
()
==
1
);
if
(
is_1d_conv
)
{
pads
[
0
]
=
0
;
pads
[
1
]
=
static_cast
<
int
>
(
info
.
pad_info
(
0
));
strides
[
0
]
=
1
;
strides
[
1
]
=
static_cast
<
int
>
(
info
.
stride_info
(
0
));
dilations
[
0
]
=
1
;
dilations
[
1
]
=
static_cast
<
int
>
(
info
.
dilation_info
(
0
));
}
else
{
for
(
size_t
i
=
0
;
i
<
info
.
ndim
();
++
i
)
{
pads
[
i
]
=
static_cast
<
int
>
(
info
.
pad_info
(
i
));
strides
[
i
]
=
static_cast
<
int
>
(
info
.
stride_info
(
i
));
dilations
[
i
]
=
static_cast
<
int
>
(
info
.
dilation_info
(
i
));
}
}
}
void
calculateStrides
(
const
std
::
vector
<
int
>
&
dims
,
std
::
vector
<
int
>
&
strides
,
int
ndim
)
const
{
strides
[
ndim
-
1
]
=
1
;
for
(
int
d
=
ndim
-
2
;
d
>=
0
;
--
d
)
{
strides
[
d
]
=
strides
[
d
+
1
]
*
dims
[
d
+
1
];
}
}
infiniStatus_t
getCudnnDataType
(
infiniDtype_t
data_type
,
cudnnDataType_t
&
cudnn_data_type
)
const
{
if
(
data_type
==
INFINI_DTYPE_F16
)
{
cudnn_data_type
=
device
::
nvidia
::
getCudnnDtype
(
data_type
);
}
else
if
(
data_type
==
INFINI_DTYPE_F32
)
{
cudnn_data_type
=
device
::
nvidia
::
getCudnnDtype
(
data_type
);
}
else
if
(
data_type
==
INFINI_DTYPE_BF16
)
{
cudnn_data_type
=
device
::
nvidia
::
getCudnnDtype
(
data_type
);
}
else
{
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
return
INFINI_STATUS_SUCCESS
;
}
infiniStatus_t
createBasicDescriptors
(
const
std
::
vector
<
int
>
&
input_dims
,
const
std
::
vector
<
int
>
&
output_dims
,
const
std
::
vector
<
int
>
&
filter_dims
,
cudnnDataType_t
cudnn_data_type
,
int
actual_tensor_ndim
)
{
CHECK_CUDNN
(
cudnnCreateTensorDescriptor
(
&
x_desc
));
CHECK_CUDNN
(
cudnnCreateTensorDescriptor
(
&
y_desc
));
CHECK_CUDNN
(
cudnnCreateFilterDescriptor
(
&
w_desc
));
CHECK_CUDNN
(
cudnnCreateConvolutionDescriptor
(
&
conv_desc
));
CHECK_CUDNN
(
cudnnSetTensorNdDescriptorEx
(
x_desc
,
CUDNN_TENSOR_NCHW
,
cudnn_data_type
,
actual_tensor_ndim
,
input_dims
.
data
()));
CHECK_CUDNN
(
cudnnSetTensorNdDescriptorEx
(
y_desc
,
CUDNN_TENSOR_NCHW
,
cudnn_data_type
,
actual_tensor_ndim
,
output_dims
.
data
()));
CHECK_CUDNN
(
cudnnSetFilterNdDescriptor
(
w_desc
,
cudnn_data_type
,
CUDNN_TENSOR_NCHW
,
actual_tensor_ndim
,
filter_dims
.
data
()));
return
INFINI_STATUS_SUCCESS
;
}
infiniStatus_t
createBiasDescriptors
(
const
ConvInfo
&
info
,
cudnnDataType_t
cudnn_data_type
,
int
actual_tensor_ndim
)
{
if
(
info
.
bias_dims_size
()
==
0
)
{
b_desc
=
nullptr
;
act_desc
=
nullptr
;
return
INFINI_STATUS_SUCCESS
;
}
std
::
vector
<
int
>
bias_dims_arr
(
actual_tensor_ndim
);
std
::
vector
<
int
>
bias_strides_arr
(
actual_tensor_ndim
);
bias_dims_arr
[
0
]
=
1
;
bias_dims_arr
[
1
]
=
static_cast
<
int
>
(
info
.
out_channels
());
for
(
int
i
=
2
;
i
<
actual_tensor_ndim
;
++
i
)
{
bias_dims_arr
[
i
]
=
1
;
}
if
(
actual_tensor_ndim
==
4
)
{
bias_strides_arr
[
0
]
=
static_cast
<
int
>
(
info
.
out_channels
());
bias_strides_arr
[
1
]
=
1
;
bias_strides_arr
[
2
]
=
1
;
bias_strides_arr
[
3
]
=
1
;
}
else
{
calculateStrides
(
bias_dims_arr
,
bias_strides_arr
,
actual_tensor_ndim
);
}
CHECK_CUDNN
(
cudnnCreateTensorDescriptor
(
&
b_desc
));
CHECK_CUDNN
(
cudnnSetTensorNdDescriptor
(
b_desc
,
cudnn_data_type
,
static_cast
<
int
>
(
bias_dims_arr
.
size
()),
bias_dims_arr
.
data
(),
bias_strides_arr
.
data
()));
CHECK_CUDNN
(
cudnnCreateActivationDescriptor
(
&
act_desc
));
CHECK_CUDNN
(
cudnnSetActivationDescriptor
(
act_desc
,
CUDNN_ACTIVATION_IDENTITY
,
CUDNN_NOT_PROPAGATE_NAN
,
0.0
));
return
INFINI_STATUS_SUCCESS
;
}
infiniStatus_t
setupConvolutionDescriptor
(
const
std
::
vector
<
int
>
&
pads
,
const
std
::
vector
<
int
>
&
strides
,
const
std
::
vector
<
int
>
&
dilations
,
int
spatial_ndim
,
cudnnDataType_t
compute_type
)
{
CHECK_CUDNN
(
cudnnSetConvolutionNdDescriptor
(
conv_desc
,
spatial_ndim
,
pads
.
data
(),
strides
.
data
(),
dilations
.
data
(),
CUDNN_CROSS_CORRELATION
,
compute_type
));
return
INFINI_STATUS_SUCCESS
;
}
infiniStatus_t
setupAlgorithmWithoutBias
()
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
CHECK_STATUS
(
internal
->
useCudnn
(
nullptr
,
[
&
](
cudnnHandle_t
handle
)
{
CHECK_CUDNN
(
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
x_desc
,
w_desc
,
conv_desc
,
y_desc
,
algo
,
&
workspace_size
));
return
INFINI_STATUS_SUCCESS
;
}));
return
INFINI_STATUS_SUCCESS
;
}
infiniStatus_t
setupAlgorithmWithBias
()
{
int
maxAlgoCount
=
0
;
CHECK_STATUS
(
internal
->
useCudnn
(
nullptr
,
[
&
](
cudnnHandle_t
handle
)
{
CHECK_CUDNN
(
cudnnGetConvolutionForwardAlgorithmMaxCount
(
handle
,
&
maxAlgoCount
));
return
INFINI_STATUS_SUCCESS
;
}));
if
(
maxAlgoCount
<=
0
)
{
maxAlgoCount
=
8
;
}
std
::
vector
<
cudnnConvolutionFwdAlgoPerf_t
>
perf_results
(
maxAlgoCount
);
int
algoCounts
=
0
;
CHECK_STATUS
(
internal
->
useCudnn
(
nullptr
,
[
&
](
cudnnHandle_t
handle
)
{
CHECK_CUDNN
(
cudnnFindConvolutionForwardAlgorithm
(
handle
,
x_desc
,
w_desc
,
conv_desc
,
y_desc
,
maxAlgoCount
,
&
algoCounts
,
perf_results
.
data
()));
return
INFINI_STATUS_SUCCESS
;
}));
if
(
algoCounts
<
1
)
{
return
INFINI_STATUS_BAD_PARAM
;
}
for
(
int
i
=
0
;
i
<
algoCounts
;
++
i
)
{
CHECK_STATUS
(
internal
->
useCudnn
(
nullptr
,
[
&
](
cudnnHandle_t
handle
)
{
CHECK_CUDNN
(
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
x_desc
,
w_desc
,
conv_desc
,
y_desc
,
perf_results
[
i
].
algo
,
&
workspace_size
));
return
INFINI_STATUS_SUCCESS
;
}));
algo
=
perf_results
[
i
].
algo
;
break
;
}
return
INFINI_STATUS_SUCCESS
;
}
#endif
public:
Opaque
(
Opaque
&&
other
)
noexcept
:
internal
(
std
::
move
(
other
.
internal
)),
workspace_size
(
other
.
workspace_size
)
// clang-format off
#ifdef ENABLE_CUDNN_API
,
x_desc
(
other
.
x_desc
)
,
y_desc
(
other
.
y_desc
)
,
w_desc
(
other
.
w_desc
)
,
b_desc
(
other
.
b_desc
)
,
act_desc
(
other
.
act_desc
)
,
conv_desc
(
other
.
conv_desc
)
,
algo
(
other
.
algo
)
#endif
// clang-format on
{
#ifdef ENABLE_CUDNN_API
other
.
x_desc
=
nullptr
;
other
.
y_desc
=
nullptr
;
other
.
w_desc
=
nullptr
;
other
.
b_desc
=
nullptr
;
other
.
act_desc
=
nullptr
;
other
.
conv_desc
=
nullptr
;
#endif
other
.
workspace_size
=
0
;
}
~
Opaque
()
{
#ifdef ENABLE_CUDNN_API
CLEANUP_CUDNN_DESCRIPTORS
();
#endif
}
#ifdef ENABLE_CUDNN_API
infiniStatus_t
initializeCudnnContext
(
ConvInfo
&
info
,
infiniDtype_t
data_type
,
cudnnDataType_t
compute_type
)
{
bool
is_1d_conv
=
(
info
.
ndim
()
==
1
);
int
actual_tensor_ndim
=
is_1d_conv
?
4
:
static_cast
<
int
>
(
info
.
ndim
()
+
2
);
int
spatial_ndim_for_conv_desc
=
is_1d_conv
?
2
:
static_cast
<
int
>
(
info
.
ndim
());
std
::
vector
<
int
>
input_dims_arr
(
actual_tensor_ndim
);
std
::
vector
<
int
>
output_dims_arr
(
actual_tensor_ndim
);
std
::
vector
<
int
>
filter_dims_arr
(
actual_tensor_ndim
);
std
::
vector
<
int
>
input_strides_arr
(
actual_tensor_ndim
);
std
::
vector
<
int
>
output_strides_arr
(
actual_tensor_ndim
);
initializeDimensionArrays
(
info
,
input_dims_arr
,
output_dims_arr
,
filter_dims_arr
,
input_strides_arr
,
output_strides_arr
);
std
::
vector
<
int
>
pads_arr
(
spatial_ndim_for_conv_desc
);
std
::
vector
<
int
>
strides_arr
(
spatial_ndim_for_conv_desc
);
std
::
vector
<
int
>
dilations_arr
(
spatial_ndim_for_conv_desc
);
initializeConvolutionParams
(
info
,
pads_arr
,
strides_arr
,
dilations_arr
);
cudnnDataType_t
cudnn_data_type
;
CHECK_STATUS
(
getCudnnDataType
(
data_type
,
cudnn_data_type
));
CHECK_STATUS
(
createBasicDescriptors
(
input_dims_arr
,
output_dims_arr
,
filter_dims_arr
,
cudnn_data_type
,
actual_tensor_ndim
));
CHECK_STATUS
(
createBiasDescriptors
(
info
,
cudnn_data_type
,
actual_tensor_ndim
));
CHECK_STATUS
(
setupConvolutionDescriptor
(
pads_arr
,
strides_arr
,
dilations_arr
,
spatial_ndim_for_conv_desc
,
compute_type
));
if
(
info
.
bias_dims_size
()
==
0
)
{
CHECK_STATUS
(
setupAlgorithmWithoutBias
());
}
else
{
CHECK_STATUS
(
setupAlgorithmWithBias
());
}
return
INFINI_STATUS_SUCCESS
;
}
#endif
static
inline
utils
::
Result
<
Opaque
>
create
(
std
::
shared_ptr
<
device
::
nvidia
::
Handle
::
Internal
>
internal_ptr
,
ConvInfo
&
info
,
infiniDtype_t
data_type
)
{
#ifdef ENABLE_CUDNN_API
Opaque
opaque
(
internal_ptr
);
auto
status
=
opaque
.
initializeCudnnContext
(
info
,
data_type
,
CUDNN_DATA_FLOAT
);
if
(
status
!=
INFINI_STATUS_SUCCESS
)
{
return
status
;
}
return
utils
::
Result
<
Opaque
>
(
std
::
move
(
opaque
));
#else
return
INFINI_STATUS_NOT_IMPLEMENTED
;
#endif
}
};
Descriptor
::~
Descriptor
()
{
if
(
_opaque
)
{
delete
_opaque
;
}
}
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle_
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
y_desc
,
infiniopTensorDescriptor_t
x_desc
,
infiniopTensorDescriptor_t
w_desc
,
infiniopTensorDescriptor_t
b_desc
,
const
void
*
pads
,
const
void
*
strides
,
const
void
*
dilations
,
size_t
n
)
{
#ifdef ENABLE_CUDNN_API
auto
handle
=
reinterpret_cast
<
device
::
nvidia
::
Handle
*>
(
handle_
);
auto
dtype
=
y_desc
->
dtype
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_BF16
);
auto
result
=
ConvInfo
::
create
(
handle_
,
y_desc
,
x_desc
,
w_desc
,
b_desc
,
pads
,
strides
,
dilations
,
n
);
CHECK_RESULT
(
result
);
auto
conv_info
=
result
.
take
();
auto
opaque_result
=
Opaque
::
create
(
handle
->
internal
(),
conv_info
,
dtype
);
CHECK_RESULT
(
opaque_result
);
auto
opaque
=
new
Opaque
(
opaque_result
.
take
());
*
desc_ptr
=
new
Descriptor
(
dtype
,
std
::
move
(
conv_info
),
opaque
->
workspace_size
,
opaque
,
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
#else
return
INFINI_STATUS_NOT_IMPLEMENTED
;
#endif
}
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
const
void
*
w
,
const
void
*
bias
,
void
*
stream
)
const
{
#ifdef ENABLE_CUDNN_API
const
float
alpha
=
1.0
f
,
beta
=
0.0
f
;
if
(
bias
!=
nullptr
)
{
CHECK_STATUS
(
_opaque
->
internal
->
useCudnn
(
(
cudaStream_t
)
stream
,
[
&
](
cudnnHandle_t
handle
)
{
CHECK_CUDNN
(
cudnnConvolutionBiasActivationForward
(
handle
,
&
alpha
,
_opaque
->
x_desc
,
x
,
_opaque
->
w_desc
,
w
,
_opaque
->
conv_desc
,
_opaque
->
algo
,
workspace
,
workspace_size
,
&
beta
,
_opaque
->
y_desc
,
y
,
_opaque
->
b_desc
,
bias
,
_opaque
->
act_desc
,
_opaque
->
y_desc
,
y
));
return
INFINI_STATUS_SUCCESS
;
}));
}
else
{
CHECK_STATUS
(
_opaque
->
internal
->
useCudnn
(
(
cudaStream_t
)
stream
,
[
&
](
cudnnHandle_t
handle
)
{
CHECK_CUDNN
(
cudnnConvolutionForward
(
handle
,
&
alpha
,
_opaque
->
x_desc
,
x
,
_opaque
->
w_desc
,
w
,
_opaque
->
conv_desc
,
_opaque
->
algo
,
workspace
,
workspace_size
,
&
beta
,
_opaque
->
y_desc
,
y
));
return
INFINI_STATUS_SUCCESS
;
}));
}
return
INFINI_STATUS_SUCCESS
;
#else
return
INFINI_STATUS_NOT_IMPLEMENTED
;
#endif
}
}
// namespace op::conv::nvidia
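One detail of the file above worth illustrating: cuDNN's Nd descriptors require at least four dimensions, so the 1D case is promoted to a 2D convolution over a (N, C, 1, L) tensor with a unit pad/stride/dilation in the dummy dimension. A minimal standalone sketch of the promoted arrays, under assumed sizes (batch 2, 4 to 8 channels, length 100, kernel 5, pad 2, stride 1 — illustrative only, mirroring initializeDimensionArrays/initializeConvolutionParams):

#include <vector>

int main() {
    std::vector<int> input_dims  = {2, 4, 1, 100}; // (N, C_in, 1, L)
    std::vector<int> filter_dims = {8, 4, 1, 5};   // (C_out, C_in, 1, K)
    std::vector<int> pads      = {0, 2};           // dummy dimension stays unpadded
    std::vector<int> strides   = {1, 1};
    std::vector<int> dilations = {1, 1};
    // Output length: (100 + 2*2 - 5) / 1 + 1 = 100, so the output is (2, 8, 1, 100).
    std::vector<int> output_dims = {2, 8, 1, 100};
    return 0;
}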
src/infiniop/ops/conv/nvidia/conv_nvidia.cuh (new file, mode 100644)
#ifndef __CONV_CUDA_CUH__
#define __CONV_CUDA_CUH__

#include "../conv.h"

DESCRIPTOR(nvidia)

#endif // __CONV_CUDA_CUH__
src/infiniop/ops/conv/operator.cc (new file, mode 100644)
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/conv.h"
#ifdef ENABLE_CPU_API
#include "cpu/conv_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/conv_nvidia.cuh"
#endif
__C
__export
infiniStatus_t
infiniopCreateConvDescriptor
(
infiniopHandle_t
handle
,
infiniopConvDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
y_desc
,
infiniopTensorDescriptor_t
x_desc
,
infiniopTensorDescriptor_t
w_desc
,
infiniopTensorDescriptor_t
b_desc
,
void
*
pads
,
void
*
strides
,
void
*
dilations
,
size_t
n
)
{
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::conv::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::conv::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
x_desc, \
w_desc, \
b_desc, \
pads, \
strides, \
dilations, \
n)
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef CREATE
}
__C
infiniStatus_t
infiniopGetConvWorkspaceSize
(
infiniopConvDescriptor_t
desc
,
size_t
*
size
)
{
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<const op::conv::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef GET
}
__C
infiniStatus_t
infiniopConv
(
infiniopConvDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
const
void
*
w
,
const
void
*
bias
,
void
*
stream
)
{
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::conv::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, \
y, \
x, \
w, \
bias, \
stream)
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef CALCULATE
}
__C
infiniStatus_t
infiniopDestroyConvDescriptor
(
infiniopConvDescriptor_t
desc
)
{
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::conv::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef DELETE
}
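Taken together, the new entry points follow the usual infiniop create / query-workspace / run / destroy pattern. A hedged usage sketch follows; the handle, tensor descriptors, device buffers, and the alloc/dealloc callbacks are assumed to be prepared by the caller and are not part of this commit:

#include <cstddef>
#include "infiniop/ops/conv.h" // as included by operator.cc above

// Sketch: drive a 2D conv (pad 1, stride 1, dilation 1) end to end.
infiniStatus_t run_conv2d(infiniopHandle_t handle,
                          infiniopTensorDescriptor_t y_desc,
                          infiniopTensorDescriptor_t x_desc,
                          infiniopTensorDescriptor_t w_desc,
                          void *y, const void *x, const void *w,
                          void *(*alloc)(size_t), void (*dealloc)(void *)) {
    size_t pads[2] = {1, 1};        // read back as size_t* by ConvInfo::create
    ptrdiff_t strides[2] = {1, 1};  // read back as ptrdiff_t*
    size_t dilations[2] = {1, 1};

    infiniopConvDescriptor_t conv = nullptr;
    infiniStatus_t status = infiniopCreateConvDescriptor(
        handle, &conv, y_desc, x_desc, w_desc,
        /*b_desc=*/nullptr, pads, strides, dilations, /*n=*/2);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    status = infiniopGetConvWorkspaceSize(conv, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS) {
        void *workspace = alloc(workspace_size);
        status = infiniopConv(conv, workspace, workspace_size,
                              y, x, w, /*bias=*/nullptr, /*stream=*/nullptr);
        dealloc(workspace);
    }
    infiniopDestroyConvDescriptor(conv);
    return status;
}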
src/infiniop/ops/gemm/maca/gemm_maca.h (deleted, mode 100644 → 0)
#ifndef __GEMM_MACA_H__
#define __GEMM_MACA_H__

#include "../gemm.h"

DESCRIPTOR(maca)

#endif // __GEMM_MACA_H__
src/infiniop/ops/gemm/maca/gemm_maca.cc → src/infiniop/ops/gemm/metax/gemm_metax.cc (renamed)
#include "gemm_m
aca
.h"
#include "../../../devices/m
aca/
common
_maca
.h"
#include "../../../devices/m
aca/maca
_handle.h"
#include "gemm_m
etax
.h"
#include "../../../devices/m
etax/metax_
common.h"
#include "../../../devices/m
etax/metax
_handle.h"
namespace
op
::
gemm
::
m
aca
{
namespace
op
::
gemm
::
m
etax
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
m
aca
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
m
etax
::
Handle
::
Internal
>
internal
;
};
Descriptor
::~
Descriptor
()
{
...
...
@@ -18,12 +18,10 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
auto
handle
=
reinterpret_cast
<
device
::
m
aca
::
Handle
*>
(
handle_
);
auto
handle
=
reinterpret_cast
<
device
::
m
etax
::
Handle
*>
(
handle_
);
auto
dtype
=
c_desc
->
dtype
();
if
(
dtype
!=
INFINI_DTYPE_F16
&&
dtype
!=
INFINI_DTYPE_F32
)
{
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_BF16
);
auto
result
=
MatmulInfo
::
create
(
c_desc
,
a_desc
,
b_desc
,
MatrixLayout
::
COL_MAJOR
);
CHECK_RESULT
(
result
);
...
...
@@ -53,7 +51,10 @@ infiniStatus_t Descriptor::calculate(
a_type
=
b_type
=
c_type
=
HPCC_R_16F
;
compute_type
=
HCBLAS_COMPUTE_32F
;
break
;
case
INFINI_DTYPE_BF16
:
a_type
=
b_type
=
c_type
=
HPCC_R_16BF
;
compute_type
=
HCBLAS_COMPUTE_32F
;
break
;
case
INFINI_DTYPE_F32
:
a_type
=
b_type
=
c_type
=
HPCC_R_32F
;
compute_type
=
HCBLAS_COMPUTE_32F_FAST_TF32
;
...
...
@@ -103,4 +104,4 @@ infiniStatus_t Descriptor::calculate(
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::gemm::m
aca
}
// namespace op::gemm::m
etax
src/infiniop/ops/gemm/metax/gemm_metax.h (new file, mode 100644)
#ifndef __GEMM_METAX_H__
#define __GEMM_METAX_H__

#include "../gemm.h"

DESCRIPTOR(metax)

#endif // __GEMM_METAX_H__
src/infiniop/ops/gemm/cuda/gemm_cuda.cu → src/infiniop/ops/gemm/nvidia/gemm_nvidia.cu (renamed)
#include "../../../devices/
cuda/cud
a_handle.cuh"
#include "gemm_
cud
a.cuh"
#include "../../../devices/
nvidia/nvidi
a_handle.cuh"
#include "gemm_
nvidi
a.cuh"
namespace
op
::
gemm
::
cud
a
{
namespace
op
::
gemm
::
nvidi
a
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal
;
};
Descriptor
::~
Descriptor
()
{
...
...
@@ -17,7 +17,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
auto
handle
=
reinterpret_cast
<
device
::
cuda
::
nvidia
::
Handle
*>
(
handle_
);
auto
handle
=
reinterpret_cast
<
device
::
nvidia
::
Handle
*>
(
handle_
);
auto
dtype
=
c_desc
->
dtype
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_BF16
);
...
...
@@ -43,7 +43,7 @@ infiniStatus_t Descriptor::calculate(
void
*
stream
)
const
{
cudaDataType
a_type
,
b_type
,
c_type
;
#ifdef ENABLE_ILUVATAR_
CUDA_
API
#ifdef ENABLE_ILUVATAR_API
cudaDataType
compute_type
;
#else
cublasComputeType_t
compute_type
;
...
...
@@ -52,7 +52,7 @@ infiniStatus_t Descriptor::calculate(
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
a_type
=
b_type
=
c_type
=
CUDA_R_16F
;
#ifdef ENABLE_ILUVATAR_
CUDA_
API
#ifdef ENABLE_ILUVATAR_API
compute_type
=
CUDA_R_32F
;
#else
compute_type
=
CUBLAS_COMPUTE_32F
;
...
...
@@ -60,7 +60,7 @@ infiniStatus_t Descriptor::calculate(
break
;
case
INFINI_DTYPE_BF16
:
a_type
=
b_type
=
c_type
=
CUDA_R_16BF
;
#ifdef ENABLE_ILUVATAR_
CUDA_
API
#ifdef ENABLE_ILUVATAR_API
compute_type
=
CUDA_R_32F
;
#else
compute_type
=
CUBLAS_COMPUTE_32F
;
...
...
@@ -68,7 +68,7 @@ infiniStatus_t Descriptor::calculate(
break
;
case
INFINI_DTYPE_F32
:
a_type
=
b_type
=
c_type
=
CUDA_R_32F
;
#if defined ENABLE_ILUVATAR_
CUDA_
API
#if defined ENABLE_ILUVATAR_API
compute_type
=
CUDA_R_32F
;
#elif defined ENABLE_SUGON_CUDA_API
compute_type
=
CUBLAS_COMPUTE_32F
;
...
...
@@ -121,4 +121,4 @@ infiniStatus_t Descriptor::calculate(
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::gemm::
cud
a
}
// namespace op::gemm::
nvidi
a
src/infiniop/ops/gemm/cuda/gemm_cuda.cuh → src/infiniop/ops/gemm/nvidia/gemm_nvidia.cuh (renamed)
@@ -3,6 +3,6 @@
 #include "../gemm.h"

-DESCRIPTOR(cuda)
+DESCRIPTOR(nvidia)

 #endif // __GEMM_CUDA_CUH__
src/infiniop/ops/gemm/operator.cc (modified)
@@ -5,8 +5,8 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/gemm_cpu.h"
 #endif
-#ifdef ENABLE_CUDA_API
-#include "cuda/gemm_cuda.cuh"
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/gemm_nvidia.cuh"
 #endif
 #ifdef ENABLE_CAMBRICON_API
 #include "bang/gemm_bang.h"
@@ -15,7 +15,7 @@
 #include "ascend/gemm_ascend.h"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/gemm_maca.h"
+#include "metax/gemm_metax.h"
 #endif
 #ifdef ENABLE_MOORE_API
 #include "musa/gemm_musa.h"
@@ -45,8 +45,11 @@ __C infiniStatus_t infiniopCreateGemmDescriptor(
 #ifdef ENABLE_CPU_API
     CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    CREATE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_CAMBRICON_API
     CREATE(INFINI_DEVICE_CAMBRICON, bang);
@@ -55,7 +58,7 @@ __C infiniStatus_t infiniopCreateGemmDescriptor(
     CREATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-    CREATE(INFINI_DEVICE_METAX, maca);
+    CREATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
     CREATE(INFINI_DEVICE_MOORE, musa);
@@ -87,8 +90,11 @@ infiniopGetGemmWorkspaceSize(
 #ifdef ENABLE_CPU_API
     GET(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    GET(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_CAMBRICON_API
     GET(INFINI_DEVICE_CAMBRICON, bang);
@@ -97,7 +103,7 @@ infiniopGetGemmWorkspaceSize(
     GET(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-    GET(INFINI_DEVICE_METAX, maca);
+    GET(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
     GET(INFINI_DEVICE_MOORE, musa);
@@ -136,8 +142,11 @@ __C infiniStatus_t infiniopGemm(
 #ifdef ENABLE_CPU_API
     CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_CAMBRICON_API
     CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
@@ -146,7 +155,7 @@ __C infiniStatus_t infiniopGemm(
     CALCULATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-    CALCULATE(INFINI_DEVICE_METAX, maca);
+    CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
     CALCULATE(INFINI_DEVICE_MOORE, musa);
@@ -175,8 +184,11 @@ infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) {
 #ifdef ENABLE_CPU_API
     DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    DELETE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_CAMBRICON_API
     DELETE(INFINI_DEVICE_CAMBRICON, bang);
@@ -185,7 +197,7 @@ infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) {
     DELETE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-    DELETE(INFINI_DEVICE_METAX, maca);
+    DELETE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
     DELETE(INFINI_DEVICE_MOORE, musa);
src/infiniop/ops/mul/cpu/mul_cpu.cc (modified)
@@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create(
     const auto &a_shape = a_desc->shape();
     const auto &b_shape = b_desc->shape();

-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
     CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);
@@ -43,6 +43,8 @@ infiniStatus_t Descriptor::calculate(
         return _device_info->calculate<MulOp, float>(_info, output, inputs, stream);
     case INFINI_DTYPE_F64:
         return _device_info->calculate<MulOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<MulOp, bf16_t>(_info, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh → src/infiniop/ops/mul/cuda/kernel.cuh (renamed)
 #ifndef __MUL_CUDA_H__
 #define __MUL_CUDA_H__

 #include "../../../elementwise/cuda/elementwise_cuda.cuh"
 #include <cuda_fp16.h>

 namespace op::mul::cuda {

 typedef struct MulOp {
     static constexpr size_t num_inputs = 2;
     template <typename T>
     __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-        if constexpr (std::is_same_v<T, half2>) {
+        if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, cuda_bfloat162>) {
             return __hmul2(a, b);
-        } else if constexpr (std::is_same_v<T, half>) {
+        } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
             return __hmul(a, b);
         } else if constexpr (std::is_same_v<T, float>) {
             return __fmul_rn(a, b);
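For context, MulOp is the functor that the shared elementwise framework instantiates once per dtype. A minimal standalone CUDA sketch of how such a functor gets applied per thread follows; the launcher here is a stand-in for the real dispatch in elementwise_cuda.cuh, not the framework's actual kernel:

#include <cuda_fp16.h>

// Stand-in elementwise kernel: applies a MulOp-like functor per element.
template <typename Op, typename T>
__global__ void elementwise_binary(const T *a, const T *b, T *c, size_t n) {
    size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
    if (i < n) {
        c[i] = Op{}(a[i], b[i]); // dispatches to __hmul/__fmul_rn/etc. by T
    }
}

// Example launch over n half elements (d_a/d_b/d_c are assumed device buffers):
//   elementwise_binary<op::mul::cuda::MulOp><<<(n + 255) / 256, 256>>>(d_a, d_b, d_c, n);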
src/infiniop/ops/mul/metax/mul_metax.h (new file, mode 100644)
#ifndef __MUL_METAX_API_H__
#define __MUL_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

ELEMENTWISE_DESCRIPTOR(mul, metax)

#endif // __MUL_METAX_API_H__
src/infiniop/ops/mul/metax/mul_metax.maca (new file, mode 100644)
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
#include "mul_metax.h"
namespace op::mul::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::MulOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::MulOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::MulOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::MulOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::metax
src/infiniop/ops/add/cuda/add_cuda.cu → src/infiniop/ops/mul/nvidia/mul_nvidia.cu (renamed)
#include "add_cuda.cuh"
#include "add_cuda_internal.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
namespace
op
::
add
::
cuda
{
#include "../cuda/kernel.cuh"
#include "mul_nvidia.cuh"
namespace
op
::
mul
::
nvidia
{
Descriptor
::~
Descriptor
()
=
default
;
...
...
@@ -11,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t
out_desc
,
std
::
vector
<
infiniopTensorDescriptor_t
>
input_desc_vec
)
{
auto
handle
=
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle_
);
auto
handle
=
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle_
);
auto
dtype
=
out_desc
->
dtype
();
const
auto
&
a_desc
=
input_desc_vec
.
at
(
0
);
...
...
@@ -20,7 +22,7 @@ infiniStatus_t Descriptor::create(
const
auto
&
a_shape
=
a_desc
->
shape
();
const
auto
&
b_shape
=
b_desc
->
shape
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
);
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
,
INFINI_DTYPE_BF16
);
CHECK_SAME_SHAPE
(
c_shape
,
a_shape
,
b_shape
);
...
...
@@ -43,15 +45,17 @@ infiniStatus_t Descriptor::calculate(
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
return
_device_info
->
calculate
<
256
,
Add
Op
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
Mul
Op
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F32
:
return
_device_info
->
calculate
<
256
,
Add
Op
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
Mul
Op
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
256
,
AddOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
MulOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
cuda
::
MulOp
,
cuda_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::
add::cud
a
}
// namespace op::
mul::nvidi
a
src/infiniop/ops/mul/cuda/mul_cuda.cuh → src/infiniop/ops/mul/nvidia/mul_nvidia.cuh (renamed)
 #ifndef __MUL_CUDA_API_H__
 #define __MUL_CUDA_API_H__

-#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

-ELEMENTWISE_DESCRIPTOR(mul, cuda)
+ELEMENTWISE_DESCRIPTOR(mul, nvidia)

 #endif // __MUL_CUDA_API_H__
src/infiniop/ops/mul/operator.cc (modified)
@@ -5,9 +5,11 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/mul_cpu.h"
 #endif
-#ifdef ENABLE_CUDA_API
-#include "cuda/mul_cuda.cuh"
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/mul_nvidia.cuh"
 #endif
+#ifdef ENABLE_METAX_API
+#include "metax/mul_metax.h"
+#endif

 __C infiniStatus_t infiniopCreateMulDescriptor(
@@ -31,8 +33,14 @@ __C infiniStatus_t infiniopCreateMulDescriptor(
 #ifdef ENABLE_CPU_API
     CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    CREATE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+    CREATE(INFINI_DEVICE_METAX, metax);
 #endif
     default:
@@ -47,15 +55,22 @@ __C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, siz
 #define GET(CASE, NAMESPACE) \
     case CASE: \
         *size = reinterpret_cast<op::mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS

     switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-    GET(INFINI_DEVICE_CPU, cpu)
+    GET(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    GET(INFINI_DEVICE_NVIDIA, cuda)
+#ifdef ENABLE_NVIDIA_API
+    GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+    GET(INFINI_DEVICE_METAX, metax);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
@@ -83,8 +98,14 @@ __C infiniStatus_t infiniopMul(
 #ifdef ENABLE_CPU_API
     CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+    CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
     default:
@@ -107,8 +128,14 @@ infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
 #ifdef ENABLE_CPU_API
     DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    DELETE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+    DELETE(INFINI_DEVICE_METAX, metax);
 #endif
     default:
src/infiniop/ops/random_sample/maca/random_sample_maca.h (deleted, mode 100644 → 0)
#ifndef __RANDOM_SAMPLE_MACA_H__
#define __RANDOM_SAMPLE_MACA_H__

#include "../random_sample.h"

DESCRIPTOR(maca)

#endif // __RANDOM_SAMPLE_MACA_H__