Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
9f8ab221
Unverified
Commit
9f8ab221
authored
Oct 19, 2023
by
zjing14
Committed by
GitHub
Oct 19, 2023
Browse files
Merge branch 'develop' into add_int8_wmma_example_instance
parents
755ace59
b4fc4d0b
Changes
490
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2490 additions
and
564 deletions
+2490
-564
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
...y/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
+282
-0
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp
...conv_tensor_rearrange/device_column_to_image_instance.hpp
+106
-0
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp
...conv_tensor_rearrange/device_image_to_column_instance.hpp
+8
-24
library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
...nsor_operation_instance/gpu/convolution_backward_data.hpp
+16
-6
library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
...ary/tensor_operation_instance/gpu/convolution_forward.hpp
+8
-7
library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
...include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+43
-0
library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
...k/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
+11
-9
library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
.../ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
+68
-17
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_instance.hpp
...v_bwd_data/device_grouped_conv_bwd_data_wmma_instance.hpp
+118
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
...nv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
+39
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp
...bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp
+0
-2
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_instance.hpp
...d_weight/device_grouped_conv_bwd_weight_wmma_instance.hpp
+119
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
...wd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
+45
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv2d_fwd_wmma_instance.hpp
...uped_conv_fwd/device_grouped_conv2d_fwd_wmma_instance.hpp
+0
-134
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp
...rouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp
+136
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
...grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
+40
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
...ration_instance/gpu/grouped_convolution_backward_data.hpp
+360
-8
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
...tion_instance/gpu/grouped_convolution_backward_weight.hpp
+291
-69
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
...or_operation_instance/gpu/grouped_convolution_forward.hpp
+800
-153
library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp
...library/tensor_operation_instance/gpu/image_to_column.hpp
+0
-135
No files found.
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
0 → 100644
View file @
9f8ab221
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
conv_tensor_rearrange_op
;
// Image to Column
// nhwc, 1d
void
add_device_image_to_column_nwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
// nhwc, 2d
void
add_device_image_to_column_nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
// nhwc, 3d
void
add_device_image_to_column_ndhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_ndhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_ndhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_ndhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
// Column to Image
// nhwc, 1d
void
add_device_column_to_image_nwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
// nhwc, 2d
void
add_device_column_to_image_nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
// nhwc, 3d
void
add_device_column_to_image_ndhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_ndhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_ndhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_ndhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
template
<
ck
::
index_t
NumDimSpatial
,
typename
ImageLayout
,
typename
InDataType
,
typename
OutDataType
,
typename
ConvTensorRearrangeOp
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceConvTensorRearrange
<
NumDimSpatial
,
ImageLayout
,
InDataType
,
OutDataType
,
ConvTensorRearrangeOp
>>
{
using
DeviceOp
=
DeviceConvTensorRearrange
<
NumDimSpatial
,
ImageLayout
,
InDataType
,
OutDataType
,
ConvTensorRearrangeOp
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
if
constexpr
(
is_same_v
<
ConvTensorRearrangeOp
,
ImageToColumn
>
)
{
if
constexpr
(
NumDimSpatial
==
1
&&
is_same_v
<
ImageLayout
,
GNWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nwc_1d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nwc_1d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nwc_1d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nwc_1d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
ImageLayout
,
GNHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_2d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_2d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_2d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_2d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
ImageLayout
,
GNDHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_ndhwc_3d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_ndhwc_3d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_ndhwc_3d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_ndhwc_3d_i8_instances
(
op_ptrs
);
}
}
}
else
if
constexpr
(
is_same_v
<
ConvTensorRearrangeOp
,
ColumnToImage
>
)
{
if
constexpr
(
NumDimSpatial
==
1
&&
is_same_v
<
ImageLayout
,
GNWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_nwc_1d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_nwc_1d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_nwc_1d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_nwc_1d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
ImageLayout
,
GNHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_nhwc_2d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_nhwc_2d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_nhwc_2d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_nhwc_2d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
ImageLayout
,
GNDHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_ndhwc_3d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_ndhwc_3d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_ndhwc_3d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_ndhwc_3d_i8_instances
(
op_ptrs
);
}
}
}
return
op_ptrs
;
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp
0 → 100644
View file @
9f8ab221
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
tensor_layout
::
convolution
;
using
namespace
ck
::
conv_tensor_rearrange_op
;
using
BF16
=
ck
::
bhalf_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_column_to_image_bf16_instances
=
std
::
tuple
<
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
// generic instance
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
>
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_column_to_image_f16_instances
=
std
::
tuple
<
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
// generic instance
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
>
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_column_to_image_f32_instances
=
std
::
tuple
<
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
// generic instance
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
>
using
device_column_to_image_i8_instances
=
std
::
tuple
<
// clang-format off
//#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar|
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
// generic instance
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
>
,
DeviceColumnToImageImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
256
,
256
,
S
<
16
,
16
>
,
16
>
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/
image_to_column
/device_image_to_column_instance.hpp
→
library/include/ck/library/tensor_operation_instance/gpu/
conv_tensor_rearrange
/device_image_to_column_instance.hpp
View file @
9f8ab221
...
...
@@ -13,6 +13,7 @@ namespace device {
namespace
instance
{
using
namespace
ck
::
tensor_layout
::
convolution
;
using
namespace
ck
::
conv_tensor_rearrange_op
;
using
BF16
=
ck
::
bhalf_t
;
using
F16
=
ck
::
half_t
;
...
...
@@ -28,17 +29,12 @@ using device_image_to_column_bf16_instances = std::tuple<
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
>
,
// generic instance
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
BF16
,
BF16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
>
...
...
@@ -52,17 +48,13 @@ using device_image_to_column_f16_instances = std::tuple<
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
>
,
// generic instance
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F16
,
F16
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
>
...
...
@@ -76,15 +68,11 @@ using device_image_to_column_f32_instances = std::tuple<
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
>
,
// generic instance
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
F32
,
F32
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
// clang-format on
...
...
@@ -97,17 +85,13 @@ using device_image_to_column_i8_instances = std::tuple<
//#####################| Dim| | | | Size| Block| Block| Cluster| Per|
//#####################| Spatial| | | | | | | Lengths| Vector|
//#####################| | | | | | | | | |
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
8
,
8
,
S
<
8
,
8
>
,
1
>
,
// generic instance
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
16
,
16
,
S
<
8
,
8
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
32
,
32
,
S
<
8
,
8
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
64
,
64
,
64
,
S
<
8
,
8
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
16
,
16
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
64
,
64
,
S
<
8
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
32
,
64
,
S
<
8
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
128
,
64
,
128
,
S
<
8
,
16
>
,
8
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
16
,
16
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
64
,
64
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
1
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
64
,
64
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
4
>
,
DeviceImageToColumnImpl
<
NDimSpatial
,
InLayout
,
int8_t
,
int8_t
,
256
,
128
,
128
,
S
<
16
,
16
>
,
8
>
,
...
...
library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp
View file @
9f8ab221
...
...
@@ -240,11 +240,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
if
constexpr
(
NumDimSpatial
==
1
&&
is_same_v
<
InLayout
,
NWC
>
&&
is_same_v
<
WeiLayout
,
KXC
>
&&
is_same_v
<
OutLayout
,
NWK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
...
...
@@ -267,17 +269,23 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
}
#endif
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
InLayout
,
NHWC
>
&&
is_same_v
<
WeiLayout
,
KYXC
>
&&
is_same_v
<
OutLayout
,
NHWK
>
)
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
InLayout
,
NHWC
>
&&
is_same_v
<
WeiLayout
,
KYXC
>
&&
is_same_v
<
OutLayout
,
NHWK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances
(
op_ptrs
);
#ifdef DL_KERNELS
add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances
(
op_ptrs
);
}
#endif
#if defined(DL_KERNELS) && defined(CK_ENABLE_FP32)
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
...
...
@@ -306,14 +314,16 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceConvBw
}
#endif
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
NDHWC
>
&&
is_same_v
<
WeiLayout
,
KZYXC
>
&&
is_same_v
<
OutLayout
,
NDHWK
>
)
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
NDHWC
>
&&
is_same_v
<
WeiLayout
,
KZYXC
>
&&
is_same_v
<
OutLayout
,
NDHWK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
...
...
library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp
View file @
9f8ab221
...
...
@@ -98,30 +98,31 @@ struct DeviceOperationInstanceFactory<
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
InLayout
,
NHWC
>
&&
is_same_v
<
WeiLayout
,
KYXC
>
&&
is_same_v
<
OutLayout
,
NHWK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP16
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances
(
op_ptrs
);
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances
(
op_ptrs
);
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
View file @
9f8ab221
...
...
@@ -312,6 +312,23 @@ void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(
DeviceGemm
<
Row
,
Col
,
Row
,
F64
,
F64
,
F64
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP8
void
add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Row
,
Row
,
F8
,
F8
,
F8
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Col
,
Col
,
Row
,
F8
,
F8
,
F8
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Row
,
Row
,
F8
,
F8
,
F8
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Col
,
Row
,
F8
,
F8
,
F8
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
template
<
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
...
...
@@ -505,6 +522,32 @@ struct DeviceOperationInstanceFactory<
#endif
}
}
#endif
#ifdef CK_ENABLE_FP8
else
if
constexpr
(
is_same_v
<
ADataType
,
ck
::
f8_t
>
&&
is_same_v
<
BDataType
,
ck
::
f8_t
>
&&
is_same_v
<
CDataType
,
ck
::
f8_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_nk_mn_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_kn_mn_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances
(
op_ptrs
);
}
}
#endif
return
op_ptrs
;
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
View file @
9f8ab221
...
...
@@ -11,12 +11,12 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#ifdef CK_ENABLE_FP16
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
#ifdef CK_ENABLE_FP16
void
add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Col
,
Row
,
...
...
@@ -68,7 +68,8 @@ void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance
PassThrough
,
PassThrough
,
Bilinear
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Row
,
Row
,
...
...
@@ -120,7 +121,7 @@ void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instances(
PassThrough
,
PassThrough
,
Bilinear
>>>&
instances
);
#endif
// GEMM + Bilinear
template
<
typename
ALayout
,
typename
BLayout
,
...
...
@@ -158,7 +159,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
half_t
>
&&
is_same_v
<
DDataType
,
half_t
>
&&
is_same_v
<
EDataType
,
half_t
>
)
{
...
...
@@ -187,8 +188,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
ADataType
,
std
::
int8_t
>
&&
is_same_v
<
BDataType
,
std
::
int8_t
>
&&
is_same_v
<
DDataType
,
std
::
int8_t
>
&&
is_same_v
<
EDataType
,
std
::
int8_t
>
)
#endif
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
ADataType
,
std
::
int8_t
>
&&
is_same_v
<
BDataType
,
std
::
int8_t
>
&&
is_same_v
<
DDataType
,
std
::
int8_t
>
&&
is_same_v
<
EDataType
,
std
::
int8_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
DLayout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
...
...
@@ -211,7 +214,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instances
(
op_ptrs
);
}
}
#endif
return
op_ptrs
;
}
};
...
...
@@ -220,4 +223,3 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp
View file @
9f8ab221
...
...
@@ -16,7 +16,7 @@ namespace ck {
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
#ifdef CK_ENABLE_FP16
void
add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmSplitK
<
Col
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
...
...
@@ -36,7 +36,8 @@ void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmSplitK
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmSplitK
<
Col
,
Row
,
Row
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
...
...
@@ -56,8 +57,8 @@ void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmSplitK
<
Row
,
Col
,
Row
,
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#if defined
CK_ENABLE_FP8
#endif
#if
(defined(CK_ENABLE_FP16) ||
defined
(
CK_ENABLE_FP8
))
void
add_device_gemm_xdl_splitk_f8_f16_f16_km_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmSplitK
<
Col
,
Row
,
Row
,
F8
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
...
...
@@ -97,6 +98,26 @@ void add_device_gemm_xdl_splitk_f16_f8_f16_mk_nk_mn_instances(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmSplitK
<
Row
,
Col
,
Row
,
F16
,
F8
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_splitk_f16_f16_f16_comp_f8_km_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmSplitK
<
Col
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
F8
>>>&
instances
);
void
add_device_gemm_xdl_splitk_f16_f16_f16_comp_f8_km_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmSplitK
<
Col
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
F8
>>>&
instances
);
void
add_device_gemm_xdl_splitk_f16_f16_f16_comp_f8_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmSplitK
<
Row
,
Row
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
F8
>>>&
instances
);
void
add_device_gemm_xdl_splitk_f16_f16_f16_comp_f8_mk_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmSplitK
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
F8
>>>&
instances
);
#endif
template
<
typename
ADataType
,
...
...
@@ -104,7 +125,8 @@ template <typename ADataType,
typename
CDataType
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
>
typename
CLayout
,
typename
ComputeType
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceGemmSplitK
<
ALayout
,
BLayout
,
...
...
@@ -114,7 +136,8 @@ struct DeviceOperationInstanceFactory<
CDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>>
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ComputeType
>>
{
using
DeviceOp
=
DeviceGemmSplitK
<
ALayout
,
BLayout
,
...
...
@@ -124,14 +147,15 @@ struct DeviceOperationInstanceFactory<
CDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ComputeType
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
ADataType
,
float
>
&&
is_same_v
<
BDataType
,
float
>
&&
is_same_v
<
CDataType
,
float
>
)
is_same_v
<
CDataType
,
float
>
&&
is_same_v
<
ComputeType
,
float
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
...
...
@@ -154,8 +178,10 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
half_t
>
&&
is_same_v
<
CDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
half_t
>
&&
is_same_v
<
CDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
...
...
@@ -178,9 +204,10 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances
(
op_ptrs
);
}
}
#if defined CK_ENABLE_FP8
else
if
constexpr
(
is_same_v
<
ADataType
,
f8_t
>
&&
is_same_v
<
BDataType
,
half_t
>
&&
is_same_v
<
CDataType
,
half_t
>
)
#endif
#if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8))
if
constexpr
(
is_same_v
<
ADataType
,
f8_t
>
&&
is_same_v
<
BDataType
,
half_t
>
&&
is_same_v
<
CDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
...
...
@@ -203,8 +230,8 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_xdl_splitk_f8_f16_f16_km_nk_mn_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
f8_t
>
&&
is_same_v
<
C
Data
Type
,
half_t
>
)
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
f8_t
>
&&
is_same_v
<
CDataType
,
half_t
>
&&
is_same_v
<
C
ompute
Type
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
...
...
@@ -227,8 +254,32 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_xdl_splitk_f16_f8_f16_km_nk_mn_instances
(
op_ptrs
);
}
}
#endif
else
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
half_t
>
&&
is_same_v
<
CDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
f8_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_splitk_f16_f16_f16_comp_f8_mk_kn_mn_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_splitk_f16_f16_f16_comp_f8_mk_nk_mn_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_splitk_f16_f16_f16_comp_f8_km_kn_mn_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_splitk_f16_f16_f16_comp_f8_km_nk_mn_instances
(
op_ptrs
);
}
}
#endif
return
op_ptrs
;
}
};
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_wmma_instance.hpp
0 → 100644
View file @
9f8ab221
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
I8
=
int8_t
;
using
I32
=
int32_t
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
namespace
ck
::
tensor_layout
::
convolution
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
ConvBwdDataDefault
=
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Default
;
static
constexpr
auto
ConvBwdData1x1S1P0
=
ck
::
tensor_operation
::
device
::
ConvolutionBackwardDataSpecialization
::
Filter1x1Stride1Pad0
;
template
<
index_t
NDSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
typename
DsDatatype
,
typename
CDEElementOp
,
ConvolutionBackwardDataSpecialization
ConvSpec
>
using
device_grouped_conv_bwd_data_wmma_f16_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
64
,
64
,
4
,
8
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
1
>
,
// blocksize=256
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
256
,
128
,
256
,
8
,
8
,
16
,
16
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
256
,
64
,
256
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
256
,
128
,
256
,
8
,
8
,
16
,
16
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
256
,
128
,
64
,
8
,
8
,
16
,
16
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
// blocksize=128
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
64
,
128
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
64
,
128
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
128
,
64
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
128
,
128
,
8
,
8
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
32
,
256
,
8
,
8
,
16
,
16
,
1
,
8
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
// blocksize=64
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
64
,
32
,
64
,
8
,
8
,
16
,
16
,
1
,
4
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
64
,
64
,
64
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
64
,
32
,
64
,
8
,
8
,
16
,
16
,
1
,
4
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
64
,
32
,
128
,
8
,
8
,
16
,
16
,
1
,
8
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
// blocksize=32
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
32
,
16
,
64
,
8
,
8
,
16
,
16
,
1
,
4
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
32
,
64
,
32
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
32
,
32
,
32
,
8
,
8
,
16
,
16
,
2
,
2
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
32
,
16
,
32
,
8
,
8
,
16
,
16
,
1
,
2
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
// clang-format on
>
;
template
<
index_t
NDSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
typename
DsDatatype
,
typename
CDEElementOp
,
ConvolutionBackwardDataSpecialization
ConvSpec
>
using
device_grouped_conv_bwd_data_wmma_i8_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
64
,
64
,
4
,
16
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
1
>
,
// blocksize=256
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
256
,
64
,
256
,
8
,
16
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
// blocksize=128
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
64
,
256
,
8
,
16
,
16
,
16
,
2
,
8
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
64
,
128
,
8
,
16
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
128
,
256
,
8
,
16
,
16
,
16
,
4
,
8
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
32
,
256
,
8
,
16
,
16
,
16
,
1
,
8
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
128
,
256
,
128
,
8
,
16
,
16
,
16
,
8
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
// blocksize=64
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
64
,
32
,
128
,
8
,
16
,
16
,
16
,
1
,
8
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
64
,
64
,
128
,
8
,
16
,
16
,
16
,
2
,
8
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
64
,
32
,
128
,
8
,
16
,
16
,
16
,
1
,
8
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
64
,
32
,
64
,
8
,
16
,
16
,
16
,
1
,
4
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
// blocksize=32
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
32
,
16
,
64
,
8
,
16
,
16
,
16
,
1
,
4
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
32
,
64
,
64
,
8
,
16
,
16
,
16
,
4
,
4
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
32
,
32
,
32
,
8
,
16
,
16
,
16
,
2
,
2
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
I32
,
I8
,
Empty_Tuple
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
32
,
16
,
64
,
8
,
16
,
16
,
16
,
1
,
4
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp
View file @
9f8ab221
...
...
@@ -18,6 +18,8 @@ namespace instance {
using
BF16
=
ck
::
bhalf_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
BF8
=
ck
::
bf8_t
;
using
F8
=
ck
::
f8_t
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
...
...
@@ -143,6 +145,43 @@ using device_grouped_conv_bwd_data_xdl_f32_instances =
// clang-format on
>
;
// f16_f16_f16_comp_f8
template
<
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
ConvolutionBackwardDataSpecialization
ConvSpec
>
using
device_grouped_conv_bwd_data_xdl_input_fp16_comp_bf8f8_instances
=
std
::
tuple
<
// clang-format off
// ##############################################| NDim| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer|
// ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector|
// ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock|
// ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
64
,
64
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
1
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
// instances for small conv.K and conv.C
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
256
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
128
,
128
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
4
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
1
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
256
,
128
,
256
,
32
,
8
,
2
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
0
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
256
,
128
,
256
,
32
,
8
,
8
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
128
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
256
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
128
,
128
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
128
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
64
,
64
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
256
,
128
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
256
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
128
,
128
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
128
,
32
,
128
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
64
,
64
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
LoopScheduler
::
Default
,
BF8
,
F8
>
,
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F32
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
true
,
true
,
1
,
64
,
32
,
64
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
,
LoopScheduler
::
Default
,
BF8
,
F8
>
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp
View file @
9f8ab221
...
...
@@ -6,8 +6,6 @@
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_instance.hpp
0 → 100644
View file @
9f8ab221
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
I8
=
int8_t
;
using
I32
=
int32_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
namespace
ck
::
tensor_layout
::
convolution
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
ConvBwdWeightDefault
=
ck
::
tensor_operation
::
device
::
ConvolutionBackwardWeightSpecialization
::
Default
;
static
constexpr
auto
ConvBwdWeightFilter1x1Stride1Pad0
=
ck
::
tensor_operation
::
device
::
ConvolutionBackwardWeightSpecialization
::
Filter1x1Stride1Pad0
;
template
<
index_t
NDSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
ConvolutionBackwardWeightSpecialization
ConvSpec
>
using
device_grouped_conv_bwd_weight_wmma_f16_instances
=
std
::
tuple
<
// clang-format off
//#####################################| NumDim| A| B| C| AData| BData| CData| AccData| A| B| C| ConvForward| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//#####################################| Spatial| Layout| Layout| Layout| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeatPerWave| NRepeatPerWave| _MBlock_MPerBlock| ScalarPerVector|
//#####################################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
64
,
64
,
4
,
8
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
1
>
,
// blocksize=256
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
128
,
256
,
8
,
8
,
16
,
16
,
2
,
8
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
4
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
256
,
128
,
8
,
8
,
16
,
16
,
8
,
2
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
2
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
256
,
64
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
4
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
64
,
256
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
// blocksize=128
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
64
,
128
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
128
,
64
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
128
,
128
,
8
,
8
,
16
,
16
,
4
,
4
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
32
,
256
,
8
,
8
,
16
,
16
,
1
,
8
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
256
,
32
,
8
,
8
,
16
,
16
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
// blocksize=64
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
32
,
8
,
8
,
16
,
16
,
4
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
32
,
64
,
8
,
8
,
16
,
16
,
1
,
4
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
64
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
128
,
32
,
8
,
8
,
16
,
16
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
32
,
128
,
8
,
8
,
16
,
16
,
1
,
8
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
// blocksize=32
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
16
,
32
,
8
,
8
,
16
,
16
,
1
,
2
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
16
,
64
,
8
,
8
,
16
,
16
,
1
,
4
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
32
,
64
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
32
,
32
,
8
,
8
,
16
,
16
,
2
,
2
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
64
,
32
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
64
,
16
,
8
,
8
,
16
,
16
,
4
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
32
,
16
,
8
,
8
,
16
,
16
,
2
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
// clang-format on
>
;
template
<
index_t
NDSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
ConvolutionBackwardWeightSpecialization
ConvSpec
>
using
device_grouped_conv_bwd_weight_wmma_i8_instances
=
std
::
tuple
<
// clang-format off
//#####################################| NumDim| A| B| C| AData| BData| CData| AccData| A| B| C| ConvForward| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//#####################################| Spatial| Layout| Layout| Layout| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeatPerWave| NRepeatPerWave| _MBlock_MPerBlock| ScalarPerVector|
//#####################################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
64
,
64
,
4
,
8
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
1
>
,
// blocksize=256
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
64
,
256
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
256
,
64
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
8
>
,
// blocksize=128
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
128
,
256
,
8
,
8
,
16
,
16
,
4
,
8
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
64
,
256
,
8
,
8
,
16
,
16
,
2
,
8
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
32
,
256
,
8
,
8
,
16
,
16
,
1
,
8
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
64
,
128
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
128
,
64
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
256
,
32
,
8
,
8
,
16
,
16
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
256
,
64
,
8
,
8
,
16
,
16
,
8
,
2
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
2
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
256
,
128
,
8
,
8
,
16
,
16
,
8
,
4
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
// blocksize=64
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
32
,
128
,
8
,
8
,
16
,
16
,
1
,
8
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
128
,
8
,
8
,
16
,
16
,
2
,
8
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
128
,
64
,
8
,
8
,
16
,
16
,
8
,
2
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
128
,
32
,
8
,
8
,
16
,
16
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
S
<
8
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
>
,
// blocksize=32
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
16
,
64
,
8
,
8
,
16
,
16
,
1
,
4
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
64
,
64
,
8
,
8
,
16
,
16
,
4
,
4
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
32
,
32
,
8
,
8
,
16
,
16
,
2
,
2
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvBwdWeight_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
CLayout
,
I8
,
I8
,
I8
,
I32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
32
,
64
,
16
,
8
,
8
,
16
,
16
,
4
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
8
,
1
,
S
<
8
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
View file @
9f8ab221
...
...
@@ -19,6 +19,14 @@ using BF16 = ck::bhalf_t;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
#ifdef CK_ENABLE_FP8
using
F8
=
ck
::
f8_t
;
#endif
#ifdef CK_ENABLE_BF8
using
BF8
=
ck
::
bf8_t
;
#endif
using
Empty_Tuple
=
ck
::
Tuple
<>
;
template
<
ck
::
index_t
...
Is
>
...
...
@@ -133,6 +141,43 @@ using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances = std::tuple<
// clang-format on
>
;
template
<
ck
::
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
ELayout
,
ConvolutionBackwardWeightSpecialization
ConvSpec
>
using
device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_instances
=
std
::
tuple
<
// clang-format off
//#########################################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| Compute| Compute|
//#########################################| Dim| | | | Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| TypeA| TypeB|
//#########################################| Spatial| | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| | |
//#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | | |
#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
// generic instance
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
64
,
4
,
8
,
32
,
32
,
2
,
2
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
2
,
4
,
true
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
2
,
4
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
2
,
BF8
,
F8
>
,
// instance for small conv.K
// for fp16 conv.K and conv.C must be divisible by 2
// since half_t atomic_add require scalar_per_x_vector % 2 == 0
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
128
,
32
,
4
,
8
,
32
,
32
,
2
,
1
,
S
<
1
,
4
,
16
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
S
<
1
,
4
,
4
,
8
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
2
,
1
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
2
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
32
,
64
,
4
,
8
,
32
,
32
,
1
,
2
,
S
<
1
,
4
,
4
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
2
,
2
,
true
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
256
,
128
,
4
,
8
,
32
,
32
,
4
,
2
,
S
<
1
,
4
,
32
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
S
<
1
,
4
,
16
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
2
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
128
,
256
,
4
,
8
,
32
,
32
,
2
,
4
,
S
<
1
,
4
,
16
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
2
,
true
,
S
<
1
,
4
,
32
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
128
,
128
,
4
,
8
,
32
,
32
,
4
,
2
,
S
<
1
,
4
,
16
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
S
<
1
,
4
,
16
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
128
,
128
,
4
,
8
,
32
,
32
,
2
,
2
,
S
<
1
,
4
,
16
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
2
,
true
,
S
<
1
,
4
,
16
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
2
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
128
,
64
,
4
,
8
,
32
,
32
,
2
,
2
,
S
<
1
,
4
,
16
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
S
<
1
,
4
,
8
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
2
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
64
,
128
,
4
,
8
,
32
,
32
,
2
,
2
,
S
<
1
,
4
,
8
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
2
,
true
,
S
<
1
,
4
,
16
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
64
,
4
,
8
,
32
,
32
,
2
,
2
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
128
,
64
,
4
,
8
,
32
,
32
,
2
,
1
,
S
<
1
,
4
,
16
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
2
,
true
,
S
<
1
,
4
,
8
,
8
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
1
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
256
,
64
,
128
,
4
,
8
,
32
,
32
,
1
,
2
,
S
<
1
,
4
,
8
,
8
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
1
,
true
,
S
<
1
,
4
,
16
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
2
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
128
,
32
,
4
,
8
,
32
,
32
,
2
,
1
,
S
<
1
,
4
,
16
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
S
<
1
,
4
,
4
,
8
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
1
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
128
,
32
,
128
,
4
,
8
,
32
,
32
,
1
,
2
,
S
<
1
,
4
,
4
,
8
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
1
,
true
,
S
<
1
,
4
,
16
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
64
,
32
,
4
,
8
,
32
,
32
,
2
,
1
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
S
<
1
,
4
,
4
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
2
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
,
BF8
,
F8
>
,
DeviceGroupedConvBwdWeight_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
ELayout
,
F16
,
F16
,
F16
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
64
,
32
,
64
,
4
,
8
,
32
,
32
,
1
,
2
,
S
<
1
,
4
,
4
,
4
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
2
,
true
,
S
<
1
,
4
,
8
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
2
,
1
,
3
>
,
2
,
8
,
4
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
,
BF8
,
F8
>
#endif
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv2d_fwd_wmma_instance.hpp
deleted
100644 → 0
View file @
755ace59
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
BF16
=
ck
::
bhalf_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
I8
=
int8_t
;
using
I32
=
int32_t
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
NHWGC
=
ck
::
tensor_layout
::
convolution
::
NHWGC
;
using
GNHWC
=
ck
::
tensor_layout
::
convolution
::
GNHWC
;
using
GKYXC
=
ck
::
tensor_layout
::
convolution
::
GKYXC
;
using
NHWGK
=
ck
::
tensor_layout
::
convolution
::
NHWGK
;
using
GNHWK
=
ck
::
tensor_layout
::
convolution
::
GNHWK
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
ConvFwdDefault
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
ConvFwd1x1P0
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Filter1x1Pad0
;
static
constexpr
auto
ConvFwd1x1S1P0
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Filter1x1Stride1Pad0
;
static
constexpr
auto
ConvFwdOddC
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
OddC
;
static
constexpr
auto
GemmMNKPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
typename
DsDatatype
,
typename
CDEElementOp
,
ConvolutionForwardSpecialization
ConvSpec
>
using
device_grouped_conv2d_fwd_wmma_f16_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| Ds| EData| AccData| CShuffle| A| B| CDE| ConvForward| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| DataType| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// blocksize=256
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
128
,
128
,
4
,
8
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
64
,
256
,
4
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
256
,
64
,
4
,
8
,
16
,
16
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
128
,
128
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
// blocksize=128
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
64
,
4
,
8
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
64
,
8
,
8
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
128
,
4
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
128
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
128
,
64
,
4
,
8
,
16
,
16
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
128
,
64
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
32
,
256
,
4
,
8
,
16
,
16
,
1
,
8
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
256
,
32
,
4
,
8
,
16
,
16
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
// blocksize=64
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
64
,
4
,
8
,
16
,
16
,
1
,
4
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
64
,
32
,
4
,
8
,
16
,
16
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
32
,
8
,
8
,
16
,
16
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
128
,
4
,
8
,
16
,
16
,
1
,
8
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
// blocksize=32
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
16
,
64
,
4
,
8
,
16
,
16
,
1
,
4
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
64
,
16
,
4
,
8
,
16
,
16
,
4
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
32
,
32
,
4
,
8
,
16
,
16
,
2
,
2
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
16
,
16
,
4
,
8
,
16
,
16
,
1
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
// clang-format on
>
;
template
<
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
typename
DsDatatype
,
typename
CDEElementOp
,
ConvolutionForwardSpecialization
ConvSpec
>
using
device_grouped_conv2d_fwd_wmma_i8_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| Ds| EData| AccData| CShuffle| A| B| CDE| ConvForward| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| DataType| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// blocksize=256
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
128
,
128
,
4
,
16
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
64
,
256
,
4
,
16
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
256
,
64
,
4
,
16
,
16
,
16
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
128
,
128
,
8
,
16
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
// blocksize=128
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
64
,
4
,
16
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
64
,
8
,
16
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
128
,
4
,
16
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
128
,
8
,
16
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
128
,
64
,
4
,
16
,
16
,
16
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
128
,
64
,
8
,
16
,
16
,
16
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
32
,
256
,
4
,
16
,
16
,
16
,
1
,
8
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
256
,
32
,
4
,
16
,
16
,
16
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
// blocksize=64
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
64
,
4
,
16
,
16
,
16
,
1
,
4
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
64
,
32
,
4
,
16
,
16
,
16
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
32
,
8
,
16
,
16
,
16
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
128
,
4
,
16
,
16
,
16
,
1
,
8
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
// blocksize=32
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
16
,
64
,
4
,
16
,
16
,
16
,
1
,
4
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
64
,
16
,
4
,
16
,
16
,
16
,
4
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
32
,
32
,
4
,
16
,
16
,
16
,
2
,
2
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
2
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
16
,
16
,
4
,
16
,
16
,
16
,
1
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp
0 → 100644
View file @
9f8ab221
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
BF16
=
ck
::
bhalf_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
I8
=
int8_t
;
using
I32
=
int32_t
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
namespace
ck
::
tensor_layout
::
convolution
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
ConvFwdDefault
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
ConvFwd1x1P0
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Filter1x1Pad0
;
static
constexpr
auto
ConvFwd1x1S1P0
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Filter1x1Stride1Pad0
;
static
constexpr
auto
ConvFwdOddC
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
OddC
;
static
constexpr
auto
GemmMNKPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
index_t
NDSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
typename
DsDatatype
,
typename
CDEElementOp
,
ConvolutionForwardSpecialization
ConvSpec
>
using
device_grouped_conv_fwd_wmma_f16_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| Ds| EData| AccData| CShuffle| A| B| CDE| ConvForward| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| DataType| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
64
,
4
,
8
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
1
>
,
// blocksize=256
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
128
,
128
,
4
,
8
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
64
,
256
,
4
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
256
,
64
,
4
,
8
,
16
,
16
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
128
,
128
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
// blocksize=128
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
64
,
4
,
8
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
64
,
8
,
8
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
128
,
4
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
128
,
8
,
8
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
128
,
64
,
4
,
8
,
16
,
16
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
128
,
64
,
8
,
8
,
16
,
16
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
32
,
256
,
4
,
8
,
16
,
16
,
1
,
8
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
256
,
32
,
4
,
8
,
16
,
16
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
// blocksize=64
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
64
,
4
,
8
,
16
,
16
,
1
,
4
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
64
,
32
,
4
,
8
,
16
,
16
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
32
,
8
,
8
,
16
,
16
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
128
,
4
,
8
,
16
,
16
,
1
,
8
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
// blocksize=32
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
16
,
64
,
4
,
8
,
16
,
16
,
1
,
4
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
64
,
16
,
4
,
8
,
16
,
16
,
4
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
32
,
32
,
4
,
8
,
16
,
16
,
2
,
2
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
DsDatatype
,
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
16
,
16
,
4
,
8
,
16
,
16
,
1
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
// clang-format on
>
;
template
<
index_t
NDSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
typename
DsDatatype
,
typename
CDEElementOp
,
ConvolutionForwardSpecialization
ConvSpec
>
using
device_grouped_conv_fwd_wmma_i8_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| Ds| EData| AccData| CShuffle| A| B| CDE| ConvForward| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| DataType| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//generic instance
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
64
,
4
,
16
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
1
>
,
// blocksize=256
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
128
,
128
,
4
,
16
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
64
,
256
,
4
,
16
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
256
,
64
,
4
,
16
,
16
,
16
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
256
,
128
,
128
,
8
,
16
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
// blocksize=128
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
64
,
4
,
16
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
64
,
8
,
16
,
16
,
16
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
128
,
4
,
16
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
64
,
128
,
8
,
16
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
128
,
64
,
4
,
16
,
16
,
16
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
128
,
64
,
8
,
16
,
16
,
16
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
32
,
256
,
4
,
16
,
16
,
16
,
1
,
8
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
128
,
256
,
32
,
4
,
16
,
16
,
16
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
>
,
// blocksize=64
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
64
,
4
,
16
,
16
,
16
,
1
,
4
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
64
,
32
,
4
,
16
,
16
,
16
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
32
,
8
,
16
,
16
,
16
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
64
,
32
,
128
,
4
,
16
,
16
,
16
,
1
,
8
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
2
>
,
8
>
,
// blocksize=32
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
16
,
64
,
4
,
16
,
16
,
16
,
1
,
4
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
64
,
16
,
4
,
16
,
16
,
16
,
4
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
32
,
32
,
4
,
16
,
16
,
16
,
2
,
2
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
I8
,
I8
,
DsDatatype
,
I8
,
I32
,
I8
,
PassThrough
,
PassThrough
,
CDEElementOp
,
ConvSpec
,
GemmMNKPadding
,
32
,
16
,
16
,
4
,
16
,
16
,
16
,
1
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
S
<
2
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
16
,
16
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
2
>
,
8
>
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
View file @
9f8ab221
...
...
@@ -13,6 +13,10 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
#ifdef CK_ENABLE_FP8
using
F8
=
ck
::
f8_t
;
#endif
using
BF16
=
ck
::
bhalf_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
...
...
@@ -174,6 +178,42 @@ using device_grouped_conv_fwd_xdl_int8_instances = std::tuple<
// clang-format on
>
;
template
<
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
ConvolutionForwardSpecialization
ConvSpec
>
using
device_grouped_conv_fwd_xdl_f16_comp_f8_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| ComputeType|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| |
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| |
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
#ifdef CK_ENABLE_FP8
// generic instance
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
1
,
F8
>
,
// instances for small conv.K and conv.C
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
1
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
128
,
256
,
32
,
8
,
8
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
128
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
128
,
128
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
128
,
32
,
128
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
,
F8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
DsLayout
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
32
,
64
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
,
F8
>
#endif
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
View file @
9f8ab221
...
...
@@ -16,6 +16,7 @@ namespace device {
namespace
instance
{
// conv2d backward data
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
GNHWK
,
...
...
@@ -30,6 +31,35 @@ void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances(
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
GNHWK
,
GKYXC
,
Empty_Tuple
,
GNHWC
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_f16_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
GNHWK
,
GKYXC
,
Empty_Tuple
,
GNHWC
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
GNHWK
,
...
...
@@ -43,7 +73,8 @@ void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
GNHWK
,
...
...
@@ -57,7 +88,37 @@ void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_bf16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
GNHWK
,
GKYXC
,
Empty_Tuple
,
GNHWC
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_i8_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
GNHWK
,
GKYXC
,
Empty_Tuple
,
GNHWC
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
NHWGK
,
...
...
@@ -72,6 +133,36 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
NHWGK
,
GKYXC
,
Empty_Tuple
,
NHWGC
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_f16_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
NHWGK
,
GKYXC
,
Empty_Tuple
,
NHWGC
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
NHWGK
,
...
...
@@ -85,7 +176,8 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
NHWGK
,
...
...
@@ -99,8 +191,38 @@ void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
NHWGK
,
GKYXC
,
Empty_Tuple
,
NHWGC
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_i8_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
2
,
NHWGK
,
GKYXC
,
Empty_Tuple
,
NHWGC
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
// conv3d backward data
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
GNDHWK
,
...
...
@@ -115,6 +237,35 @@ void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances(
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
GNDHWK
,
GKZYXC
,
Empty_Tuple
,
GNDHWC
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_f16_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
GNDHWK
,
GKZYXC
,
Empty_Tuple
,
GNDHWC
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
GNDHWK
,
...
...
@@ -128,7 +279,8 @@ void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
GNDHWK
,
...
...
@@ -142,7 +294,37 @@ void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
GNDHWK
,
GKZYXC
,
Empty_Tuple
,
GNDHWC
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_i8_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
GNDHWK
,
GKZYXC
,
Empty_Tuple
,
GNDHWC
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
NDHWGK
,
...
...
@@ -157,6 +339,35 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances(
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
NDHWGK
,
GKZYXC
,
Empty_Tuple
,
NDHWGC
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_f16_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
NDHWGK
,
GKZYXC
,
Empty_Tuple
,
NDHWGC
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
NDHWGK
,
...
...
@@ -170,7 +381,8 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
NDHWGK
,
...
...
@@ -184,14 +396,62 @@ void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
NDHWGK
,
GKZYXC
,
Empty_Tuple
,
NDHWGC
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_i8_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
NDHWGK
,
GKZYXC
,
Empty_Tuple
,
NDHWGC
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
void
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_bf8f8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdDataMultipleD
<
3
,
NDHWGK
,
GKZYXC
,
Empty_Tuple
,
NDHWGC
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
BF8
,
F8
>>>&
instances
);
#endif
template
<
ck
::
index_t
NumDimSpatial
,
typename
OutLayout
,
typename
WeiLayout
,
typename
InLayout
,
typename
OutDataType
,
typename
WeiDataType
,
typename
InDataType
>
typename
InDataType
,
typename
ComputeTypeA
,
typename
ComputeTypeB
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvBwdDataMultipleD
<
NumDimSpatial
,
...
...
@@ -205,7 +465,9 @@ struct DeviceOperationInstanceFactory<
InDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>>
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ComputeTypeA
,
ComputeTypeB
>>
{
using
DeviceOp
=
DeviceGroupedConvBwdDataMultipleD
<
NumDimSpatial
,
...
...
@@ -219,7 +481,9 @@ struct DeviceOperationInstanceFactory<
InDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ComputeTypeA
,
ComputeTypeB
>
;
static
auto
GetInstances
()
{
...
...
@@ -230,42 +494,80 @@ struct DeviceOperationInstanceFactory<
if
constexpr
(
is_same_v
<
InLayout
,
GNHWC
>
&&
is_same_v
<
WeiLayout
,
GKYXC
>
&&
is_same_v
<
OutLayout
,
GNHWK
>
)
{
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
F16
>
&&
is_same_v
<
WeiDataType
,
F16
>
&&
is_same_v
<
OutDataType
,
F16
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_f16_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_f16_1x1s1p0_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP32
else
if
constexpr
(
is_same_v
<
InDataType
,
F32
>
&&
is_same_v
<
WeiDataType
,
F32
>
&&
is_same_v
<
OutDataType
,
F32
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
InDataType
,
BF16
>
&&
is_same_v
<
WeiDataType
,
BF16
>
&&
is_same_v
<
OutDataType
,
BF16
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_i8_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_i8_1x1s1p0_instances
(
op_ptrs
);
}
#endif
}
else
if
constexpr
(
is_same_v
<
InLayout
,
NHWGC
>
&&
is_same_v
<
WeiLayout
,
GKYXC
>
&&
is_same_v
<
OutLayout
,
NHWGK
>
)
{
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
F16
>
&&
is_same_v
<
WeiDataType
,
F16
>
&&
is_same_v
<
OutDataType
,
F16
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_f16_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_f16_1x1s1p0_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP32
else
if
constexpr
(
is_same_v
<
InDataType
,
F32
>
&&
is_same_v
<
WeiDataType
,
F32
>
&&
is_same_v
<
OutDataType
,
F32
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
InDataType
,
BF16
>
&&
is_same_v
<
WeiDataType
,
BF16
>
&&
is_same_v
<
OutDataType
,
BF16
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_i8_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_i8_1x1s1p0_instances
(
op_ptrs
);
}
#endif
}
}
else
if
constexpr
(
NumDimSpatial
==
3
)
...
...
@@ -274,46 +576,96 @@ struct DeviceOperationInstanceFactory<
if
constexpr
(
is_same_v
<
InLayout
,
GNDHWC
>
&&
is_same_v
<
WeiLayout
,
GKZYXC
>
&&
is_same_v
<
OutLayout
,
GNDHWK
>
)
{
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
F16
>
&&
is_same_v
<
WeiDataType
,
F16
>
&&
is_same_v
<
OutDataType
,
F16
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_f16_1x1s1p0_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP32
else
if
constexpr
(
is_same_v
<
InDataType
,
F32
>
&&
is_same_v
<
WeiDataType
,
F32
>
&&
is_same_v
<
OutDataType
,
F32
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
InDataType
,
BF16
>
&&
is_same_v
<
WeiDataType
,
BF16
>
&&
is_same_v
<
OutDataType
,
BF16
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_i8_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_i8_1x1s1p0_instances
(
op_ptrs
);
}
#endif
}
else
if
constexpr
(
is_same_v
<
InLayout
,
NDHWGC
>
&&
is_same_v
<
WeiLayout
,
GKZYXC
>
&&
is_same_v
<
OutLayout
,
NDHWGK
>
)
{
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
F16
>
&&
is_same_v
<
WeiDataType
,
F16
>
&&
is_same_v
<
OutDataType
,
F16
>
)
is_same_v
<
OutDataType
,
F16
>
&&
is_same_v
<
ComputeTypeA
,
F16
>
&&
is_same_v
<
ComputeTypeB
,
F16
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_f16_1x1s1p0_instances
(
op_ptrs
);
}
#endif
#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
else
if
constexpr
(
is_same_v
<
InDataType
,
F16
>
&&
is_same_v
<
WeiDataType
,
F16
>
&&
is_same_v
<
OutDataType
,
F16
>
&&
is_same_v
<
ComputeTypeA
,
bf8_t
>
&&
is_same_v
<
ComputeTypeB
,
f8_t
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_input_f16_comp_bf8f8_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP32
else
if
constexpr
(
is_same_v
<
InDataType
,
F32
>
&&
is_same_v
<
WeiDataType
,
F32
>
&&
is_same_v
<
OutDataType
,
F32
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
InDataType
,
BF16
>
&&
is_same_v
<
WeiDataType
,
BF16
>
&&
is_same_v
<
OutDataType
,
BF16
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_i8_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_i8_1x1s1p0_instances
(
op_ptrs
);
}
#endif
}
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
View file @
9f8ab221
...
...
@@ -19,6 +19,7 @@ namespace instance {
// xdl
// conv1d backward weight
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
1
,
GNWC
,
...
...
@@ -30,7 +31,8 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_insta
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
1
,
GNWC
,
...
...
@@ -42,7 +44,8 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
1
,
GNWC
,
...
...
@@ -54,8 +57,9 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
// conv2d backward weight
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
GNHWC
,
...
...
@@ -67,7 +71,8 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_in
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
GNHWC
,
...
...
@@ -79,7 +84,8 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
GNHWC
,
...
...
@@ -91,7 +97,8 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NHWGC
,
...
...
@@ -103,7 +110,8 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_in
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NHWGC
,
...
...
@@ -115,7 +123,8 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NHWGC
,
...
...
@@ -127,8 +136,9 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
// conv3d backward weight
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
GNDHWC
,
...
...
@@ -140,7 +150,8 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
GNDHWC
,
...
...
@@ -153,6 +164,31 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
GNDHWC
,
GKZYXC
,
GNDHWK
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
GNDHWC
,
GKZYXC
,
GNDHWK
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
GNDHWC
,
...
...
@@ -164,7 +200,33 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
GNDHWC
,
GKZYXC
,
GNDHWK
,
int8_t
,
int8_t
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
GNDHWC
,
GKZYXC
,
GNDHWK
,
int8_t
,
int8_t
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
...
...
@@ -176,7 +238,8 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
...
...
@@ -189,6 +252,31 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
GKZYXC
,
NDHWGK
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
GKZYXC
,
NDHWGK
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
...
...
@@ -200,10 +288,52 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
void
add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
GKZYXC
,
NDHWGK
,
F16
,
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
,
BF8
,
F8
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
GKZYXC
,
NDHWGK
,
int8_t
,
int8_t
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
GKZYXC
,
NDHWGK
,
int8_t
,
int8_t
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef DL_KERNELS
// dl
// conv1d backward weight
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
1
,
GNWC
,
...
...
@@ -215,7 +345,8 @@ void add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instan
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
1
,
GNWC
,
...
...
@@ -227,7 +358,8 @@ void add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
1
,
GNWC
,
...
...
@@ -239,7 +371,8 @@ void add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
1
,
NWGC
,
...
...
@@ -251,7 +384,8 @@ void add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instan
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
1
,
NWGC
,
...
...
@@ -263,7 +397,8 @@ void add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
1
,
NWGC
,
...
...
@@ -275,8 +410,9 @@ void add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
// conv2d backward weight
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
GNHWC
,
...
...
@@ -288,7 +424,8 @@ void add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_ins
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
GNHWC
,
...
...
@@ -300,7 +437,8 @@ void add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
GNHWC
,
...
...
@@ -312,7 +450,8 @@ void add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NHWGC
,
...
...
@@ -324,7 +463,8 @@ void add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_ins
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NHWGC
,
...
...
@@ -336,7 +476,8 @@ void add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
2
,
NHWGC
,
...
...
@@ -348,8 +489,9 @@ void add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
// conv3d backward weight
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
GNDHWC
,
...
...
@@ -361,7 +503,8 @@ void add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
GNDHWC
,
...
...
@@ -373,7 +516,8 @@ void add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
GNDHWC
,
...
...
@@ -385,7 +529,8 @@ void add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
...
...
@@ -397,7 +542,8 @@ void add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
...
...
@@ -409,7 +555,8 @@ void add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvBwdWeight
<
3
,
NDHWGC
,
...
...
@@ -422,6 +569,7 @@ void add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#endif
template
<
ck
::
index_t
NumDimSpatial
,
typename
InLayout
,
...
...
@@ -429,7 +577,9 @@ template <ck::index_t NumDimSpatial,
typename
OutLayout
,
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
>
typename
OutDataType
,
typename
ComputeTypeA
,
typename
ComputeTypeB
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvBwdWeight
<
NumDimSpatial
,
InLayout
,
...
...
@@ -440,7 +590,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
OutDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>>
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ComputeTypeA
,
ComputeTypeB
>>
{
using
DeviceOp
=
DeviceGroupedConvBwdWeight
<
NumDimSpatial
,
InLayout
,
...
...
@@ -451,7 +603,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
OutDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ComputeTypeA
,
ComputeTypeB
>
;
static
auto
GetInstances
()
{
...
...
@@ -462,6 +616,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
if
constexpr
(
is_same_v
<
InLayout
,
GNWC
>
&&
is_same_v
<
WeiLayout
,
GKXC
>
&&
is_same_v
<
OutLayout
,
GNWK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
...
...
@@ -470,17 +625,20 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instances
(
op_ptrs
);
#endif
add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances
(
...
...
@@ -489,36 +647,43 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances
(
op_ptrs
);
}
#endif
}
else
if
constexpr
(
is_same_v
<
InLayout
,
NWGC
>
&&
is_same_v
<
WeiLayout
,
GKXC
>
&&
is_same_v
<
OutLayout
,
NWGK
>
)
if
constexpr
(
is_same_v
<
InLayout
,
NWGC
>
&&
is_same_v
<
WeiLayout
,
GKXC
>
&&
is_same_v
<
OutLayout
,
NWGK
>
)
{
#ifdef DL_KERNELS
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instances
(
op_ptrs
);
}
#endif
#endif
}
}
else
if
constexpr
(
NumDimSpatial
==
2
)
if
constexpr
(
NumDimSpatial
==
2
)
{
if
constexpr
(
is_same_v
<
InLayout
,
GNHWC
>
&&
is_same_v
<
WeiLayout
,
GKYXC
>
&&
is_same_v
<
OutLayout
,
GNHWK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
...
...
@@ -529,8 +694,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instances
(
...
...
@@ -539,9 +706,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances
(
...
...
@@ -550,10 +718,12 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances
(
op_ptrs
);
}
#endif
}
else
if
constexpr
(
is_same_v
<
InLayout
,
NHWGC
>
&&
is_same_v
<
WeiLayout
,
GKYXC
>
&&
is_same_v
<
OutLayout
,
NHWGK
>
)
if
constexpr
(
is_same_v
<
InLayout
,
NHWGC
>
&&
is_same_v
<
WeiLayout
,
GKYXC
>
&&
is_same_v
<
OutLayout
,
NHWGK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
...
...
@@ -564,8 +734,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instances
(
...
...
@@ -574,9 +746,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances
(
...
...
@@ -585,13 +758,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances
(
op_ptrs
);
}
#endif
}
}
else
if
constexpr
(
NumDimSpatial
==
3
)
if
constexpr
(
NumDimSpatial
==
3
)
{
if
constexpr
(
is_same_v
<
InLayout
,
GNDHWC
>
&&
is_same_v
<
WeiLayout
,
GKZYXC
>
&&
is_same_v
<
OutLayout
,
GNDHWK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
...
...
@@ -602,8 +777,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instances
(
...
...
@@ -611,10 +788,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances
(
...
...
@@ -623,10 +805,22 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instances
(
op_ptrs
);
}
#endif
}
else
if
constexpr
(
is_same_v
<
InLayout
,
NDHWGC
>
&&
is_same_v
<
WeiLayout
,
GKZYXC
>
&&
is_same_v
<
OutLayout
,
NDHWGK
>
)
if
constexpr
(
is_same_v
<
InLayout
,
NDHWGC
>
&&
is_same_v
<
WeiLayout
,
GKZYXC
>
&&
is_same_v
<
OutLayout
,
NDHWGK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
...
...
@@ -637,8 +831,11 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeTypeA
,
half_t
>
&&
is_same_v
<
ComputeTypeB
,
half_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
...
...
@@ -646,10 +843,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances
(
...
...
@@ -658,6 +860,26 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances
(
op_ptrs
);
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instances
(
op_ptrs
);
}
#endif
#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeTypeA
,
bf8_t
>
&&
is_same_v
<
ComputeTypeB
,
f8_t
>
)
{
add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances
(
op_ptrs
);
}
#endif
}
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
View file @
9f8ab221
...
...
@@ -17,6 +17,7 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
#ifdef CK_ENABLE_BF16
// grouped conv1d forward, GNWC/GKXC/GNWK
void
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
1
,
...
...
@@ -31,7 +32,9 @@ void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
1
,
GNWC
,
...
...
@@ -45,7 +48,9 @@ void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
1
,
GNWC
,
...
...
@@ -59,7 +64,9 @@ void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
1
,
GNWC
,
...
...
@@ -73,7 +80,73 @@ void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
void
add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
...
...
@@ -88,13 +161,436 @@ void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
GNHWK
,
GNHWK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
GNHWK
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
GNHWK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
// grouped conv2d forward, NHWGC/GKYXC/NHWGK
#ifdef CK_ENABLE_BF16
void
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
BF16
,
BF16
,
Empty_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
// grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK
void
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
BF16
,
BF16
,
Empty_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
GNDHWC
,
GKZYXC
,
Empty_Tuple
,
GNDHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_BF16
// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
Empty_Tuple
,
NDHWGK
,
BF16
,
BF16
,
Empty_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
Empty_Tuple
,
NDHWGK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
Empty_Tuple
,
NDHWGK
,
F16
,
F16
,
Empty_Tuple
,
...
...
@@ -102,27 +598,31 @@ void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
void
add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
Empty_Tuple
,
G
NHWK
,
F
32
,
F
32
,
N
D
HW
G
K
,
F
16
,
F
16
,
Empty_Tuple
,
F
32
,
F
16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
void
add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
Empty_Tuple
,
G
NHWK
,
N
D
HW
G
K
,
F16
,
F16
,
Empty_Tuple
,
...
...
@@ -130,21 +630,25 @@ void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
void
add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
Empty_Tuple
,
G
NHWK
,
F
32
,
F
32
,
N
D
HW
G
K
,
F
16
,
F
16
,
Empty_Tuple
,
F
32
,
F
16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
...
...
@@ -158,12 +662,15 @@ void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances
(
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHW
G
C
,
G
NHWC
,
GKYXC
,
Empty_Tuple
,
NHW
G
K
,
G
NHWK
,
F16
,
F16
,
Empty_Tuple
,
...
...
@@ -171,69 +678,64 @@ void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
void
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances
(
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
GNHWK
,
int8_t
,
int8_t
,
Empty_Tuple
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
F32
,
F32
,
F16
,
F16
,
Empty_Tuple
,
F
32
,
F
16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
// grouped conv2d forward, NHWGC/GKYXC/NHWGK
void
add_device_grouped_conv2d_fwd_
xdl_
nhw
g
c_gkyxc_nhw
g
k_
b
f16_instances
(
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv2d_fwd_
wmma_g
nhwc_gkyxc_
g
nhwk_f16_
oddc_
instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHW
G
C
,
G
NHWC
,
GKYXC
,
Empty_Tuple
,
NHW
G
K
,
B
F16
,
B
F16
,
G
NHWK
,
F16
,
F16
,
Empty_Tuple
,
B
F16
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
void
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
#ifdef CK_ENABLE_FP8
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_f8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
Empty_Tuple
,
NHWGK
,
N
D
HWGK
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
PassThrough
,
F8
>>>&
instances
);
#endif
void
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
Empty_Tuple
,
NHWGK
,
N
D
HWGK
,
F32
,
F32
,
Empty_Tuple
,
...
...
@@ -241,56 +743,57 @@ void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
// grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK
void
add_device_grouped_conv3d_fwd_xdl_
g
ndhwc_gkzyxc_
g
ndhwk_
bf16
_instances
(
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv3d_fwd_xdl_ndhw
g
c_gkzyxc_ndhw
g
k_
int8
_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
G
NDHWC
,
NDHW
G
C
,
GKZYXC
,
Empty_Tuple
,
G
NDHWK
,
BF16
,
BF16
,
NDHW
G
K
,
int8_t
,
int8_t
,
Empty_Tuple
,
BF16
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_
xdl_g
ndhwc_gkzyxc_
g
ndhwk_
f16
_instances
(
void
add_device_grouped_conv3d_fwd_
wmma_
ndhw
g
c_gkzyxc_ndhw
g
k_
i8
_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
G
NDHWC
,
NDHW
G
C
,
GKZYXC
,
Empty_Tuple
,
G
NDHWK
,
F16
,
F16
,
NDHW
G
K
,
int8_t
,
int8_t
,
Empty_Tuple
,
F16
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_
xdl_g
ndhwc_gkzyxc_
g
ndhwk_
f32
_instances
(
void
add_device_grouped_conv3d_fwd_
wmma_
ndhw
g
c_gkzyxc_ndhw
g
k_
i8_1x1p0
_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
G
NDHWC
,
NDHW
G
C
,
GKZYXC
,
Empty_Tuple
,
G
NDHWK
,
F32
,
F32
,
NDHW
G
K
,
int8_t
,
int8_t
,
Empty_Tuple
,
F32
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_conv3d_fwd_
xdl_g
ndhwc_gkzyxc_
g
ndhwk_i
nt8
_instances
(
void
add_device_grouped_conv3d_fwd_
wmma_
ndhw
g
c_gkzyxc_ndhw
g
k_i
8_1x1s1p0
_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
G
NDHWC
,
NDHW
G
C
,
GKZYXC
,
Empty_Tuple
,
G
NDHWK
,
NDHW
G
K
,
int8_t
,
int8_t
,
Empty_Tuple
,
...
...
@@ -299,27 +802,45 @@ void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(
PassThrough
,
PassThrough
>>>&
instances
);
// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances
(
void
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
Empty_Tuple
,
NDHWGK
,
BF16
,
BF16
,
int8_t
,
int8_t
,
Empty_Tuple
,
BF16
,
int8_t
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
void
add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NDHWGK
,
NHWGK
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
void
add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
NHWGC
,
GKYXC
,
Empty_Tuple
,
NHWGK
,
F16
,
F16
,
Empty_Tuple
,
...
...
@@ -327,34 +848,39 @@ void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
#if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
void
add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
N
D
HW
G
K
,
F
32
,
F
32
,
G
NHWK
,
F
16
,
F
16
,
Empty_Tuple
,
F
32
,
F
16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
void
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
void
add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
2
,
GNHWC
,
GKYXC
,
Empty_Tuple
,
N
D
HW
G
K
,
int8_t
,
int8_t
,
G
NHWK
,
F32
,
F32
,
Empty_Tuple
,
int8_t
,
F32
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
template
<
ck
::
index_t
NumDimSpatial
,
typename
InLayout
,
...
...
@@ -362,7 +888,8 @@ template <ck::index_t NumDimSpatial,
typename
OutLayout
,
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
>
typename
OutDataType
,
typename
ComputeType
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD
<
NumDimSpatial
,
InLayout
,
...
...
@@ -375,7 +902,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
OutDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>>
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ComputeType
>>
{
using
DeviceOp
=
DeviceGroupedConvFwdMultipleD
<
NumDimSpatial
,
InLayout
,
...
...
@@ -388,7 +916,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
OutDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ComputeType
>
;
static
auto
GetInstances
()
{
...
...
@@ -397,127 +926,245 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
if
constexpr
(
NumDimSpatial
==
1
&&
is_same_v
<
InLayout
,
GNWC
>
&&
is_same_v
<
WeiLayout
,
GKXC
>
&&
is_same_v
<
OutLayout
,
GNWK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
#endif
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances
(
op_ptrs
);
}
#endif
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
InLayout
,
GNHWC
>
&&
is_same_v
<
WeiLayout
,
GKYXC
>
&&
is_same_v
<
OutLayout
,
GNHWK
>
)
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
InLayout
,
GNHWC
>
&&
is_same_v
<
WeiLayout
,
GKYXC
>
&&
is_same_v
<
OutLayout
,
GNHWK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances
(
op_ptrs
);
}
#endif
#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instances
(
op_ptrs
);
}
#endif
#if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
#endif
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instances
(
op_ptrs
);
}
#endif
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
InLayout
,
NHWGC
>
&&
is_same_v
<
WeiLayout
,
GKYXC
>
&&
is_same_v
<
OutLayout
,
NHWGK
>
)
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
InLayout
,
NHWGC
>
&&
is_same_v
<
WeiLayout
,
GKYXC
>
&&
is_same_v
<
OutLayout
,
NHWGK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances
(
op_ptrs
);
}
#endif
#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances
(
op_ptrs
);
}
#endif
#if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instances
(
op_ptrs
);
}
#endif
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
GNDHWC
>
&&
is_same_v
<
WeiLayout
,
GKZYXC
>
&&
is_same_v
<
OutLayout
,
GNDHWK
>
)
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
GNDHWC
>
&&
is_same_v
<
WeiLayout
,
GKZYXC
>
&&
is_same_v
<
OutLayout
,
GNDHWK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
#endif
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instances
(
op_ptrs
);
}
#endif
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
NDHWGC
>
&&
is_same_v
<
WeiLayout
,
GKZYXC
>
&&
is_same_v
<
OutLayout
,
NDHWGK
>
)
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
NDHWGC
>
&&
is_same_v
<
WeiLayout
,
GKZYXC
>
&&
is_same_v
<
OutLayout
,
NDHWGK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
#endif
#ifdef CK_ENABLE_FP8
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
ck
::
f8_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_f8_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
half_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
#endif
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances
(
op_ptrs
);
}
#endif
}
return
op_ptrs
;
...
...
library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp
deleted
100644 → 0
View file @
755ace59
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// nhwc, 1d
void
add_device_image_to_column_nhwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
BF16
,
BF16
>>>&
instances
);
void
add_device_image_to_column_nhwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
F16
,
F16
>>>&
instances
);
void
add_device_image_to_column_nhwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
F32
,
F32
>>>&
instances
);
void
add_device_image_to_column_nhwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
1
,
GNWC
,
int8_t
,
int8_t
>>>&
instances
);
// nhwc, 2d
void
add_device_image_to_column_nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
BF16
,
BF16
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
F16
,
F16
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
F32
,
F32
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
2
,
GNHWC
,
int8_t
,
int8_t
>>>&
instances
);
// nhwc, 3d
void
add_device_image_to_column_nhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
BF16
,
BF16
>>>&
instances
);
void
add_device_image_to_column_nhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
F16
,
F16
>>>&
instances
);
void
add_device_image_to_column_nhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
F32
,
F32
>>>&
instances
);
void
add_device_image_to_column_nhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceImageToColumn
<
3
,
GNDHWC
,
int8_t
,
int8_t
>>>&
instances
);
template
<
ck
::
index_t
NumDimSpatial
,
typename
InLayout
,
typename
InDataType
,
typename
OutDataType
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceImageToColumn
<
NumDimSpatial
,
InLayout
,
InDataType
,
OutDataType
>>
{
using
DeviceOp
=
DeviceImageToColumn
<
NumDimSpatial
,
InLayout
,
InDataType
,
OutDataType
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
if
constexpr
(
NumDimSpatial
==
1
&&
is_same_v
<
InLayout
,
GNWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_1d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_1d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_1d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_1d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
InLayout
,
GNHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_2d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_2d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_2d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_2d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
GNDHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_3d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_3d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_3d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_3d_i8_instances
(
op_ptrs
);
}
}
return
op_ptrs
;
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
Prev
1
…
7
8
9
10
11
12
13
14
15
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment