Commit cfcdb03e authored by Bartlomiej Wroblewski's avatar Bartlomiej Wroblewski
Browse files

Merge remote-tracking branch 'origin/develop' into bwroblew/contraction_mixed_dt

parents a30c626b e2243a4d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Instance registration for the 3D grouped forward convolution
//   in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
// Data type: F16. Instance list: device_grouped_conv_fwd_wmma_f16_instances
// instantiated with ConvFwdOddC.
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              F16,
                                                              F16,
                                                              Empty_Tuple,
                                                              F16,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    // Name the instance list once so the call below stays short.
    using OddCInstanceList = device_grouped_conv_fwd_wmma_f16_instances<3,
                                                                        NDHWGC,
                                                                        GKZYXC,
                                                                        Empty_Tuple,
                                                                        NDHWGK,
                                                                        Empty_Tuple,
                                                                        PassThrough,
                                                                        ConvFwdOddC>;

    // Hand a default-constructed instance list to the shared helper together with `instances`.
    add_device_operation_instances(instances, OddCInstanceList{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Instance registration for the 3D grouped forward convolution
//   in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
// Data type: int8_t. Instance list: device_grouped_conv_fwd_wmma_i8_instances
// instantiated with ConvFwd1x1P0.
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    // Name the instance list once so the call below stays short.
    using Filter1x1Pad0InstanceList = device_grouped_conv_fwd_wmma_i8_instances<3,
                                                                                NDHWGC,
                                                                                GKZYXC,
                                                                                Empty_Tuple,
                                                                                NDHWGK,
                                                                                Empty_Tuple,
                                                                                PassThrough,
                                                                                ConvFwd1x1P0>;

    // Hand a default-constructed instance list to the shared helper together with `instances`.
    add_device_operation_instances(instances, Filter1x1Pad0InstanceList{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Instance registration for the 3D grouped forward convolution
//   in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
// Data type: int8_t. Instance list: device_grouped_conv_fwd_wmma_i8_instances
// instantiated with ConvFwd1x1S1P0.
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    // Name the instance list once so the call below stays short.
    using Filter1x1Stride1Pad0InstanceList =
        device_grouped_conv_fwd_wmma_i8_instances<3,
                                                  NDHWGC,
                                                  GKZYXC,
                                                  Empty_Tuple,
                                                  NDHWGK,
                                                  Empty_Tuple,
                                                  PassThrough,
                                                  ConvFwd1x1S1P0>;

    // Hand a default-constructed instance list to the shared helper together with `instances`.
    add_device_operation_instances(instances, Filter1x1Stride1Pad0InstanceList{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Instance registration for the 3D grouped forward convolution
//   in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
// Data type: int8_t. Instance list: device_grouped_conv_fwd_wmma_i8_instances
// instantiated with ConvFwdDefault.
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    // Name the instance list once so the call below stays short.
    using DefaultInstanceList = device_grouped_conv_fwd_wmma_i8_instances<3,
                                                                          NDHWGC,
                                                                          GKZYXC,
                                                                          Empty_Tuple,
                                                                          NDHWGK,
                                                                          Empty_Tuple,
                                                                          PassThrough,
                                                                          ConvFwdDefault>;

    // Hand a default-constructed instance list to the shared helper together with `instances`.
    add_device_operation_instances(instances, DefaultInstanceList{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Instance registration for the 3D grouped forward convolution
//   in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
// Data type: int8_t. Instance list: device_grouped_conv_fwd_wmma_i8_instances
// instantiated with ConvFwdOddC.
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    // Name the instance list once so the call below stays short.
    using OddCInstanceList = device_grouped_conv_fwd_wmma_i8_instances<3,
                                                                       NDHWGC,
                                                                       GKZYXC,
                                                                       Empty_Tuple,
                                                                       NDHWGK,
                                                                       Empty_Tuple,
                                                                       PassThrough,
                                                                       ConvFwdOddC>;

    // Hand a default-constructed instance list to the shared helper together with `instances`.
    add_device_operation_instances(instances, OddCInstanceList{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck { namespace ck {
...@@ -9,28 +9,50 @@ namespace tensor_operation { ...@@ -9,28 +9,50 @@ namespace tensor_operation {
namespace device { namespace device {
namespace instance { namespace instance {
void add_device_image_to_column_nhwc_1d_bf16_instances( using namespace ck::conv_tensor_rearrange_op;
std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, BF16, BF16>>>& instances)
void add_device_image_to_column_nwc_1d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_BF16
add_device_operation_instances(instances, device_image_to_column_bf16_instances<1, GNWC>{}); add_device_operation_instances(instances, device_image_to_column_bf16_instances<1, GNWC>{});
#else
ignore = instances;
#endif
} }
void add_device_image_to_column_nhwc_1d_f16_instances( void add_device_image_to_column_nwc_1d_f16_instances(
std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, F16, F16>>>& instances) std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_FP16
add_device_operation_instances(instances, device_image_to_column_f16_instances<1, GNWC>{}); add_device_operation_instances(instances, device_image_to_column_f16_instances<1, GNWC>{});
#else
ignore = instances;
#endif
} }
void add_device_image_to_column_nhwc_1d_f32_instances( void add_device_image_to_column_nwc_1d_f32_instances(
std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, F32, F32>>>& instances) std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_FP32
add_device_operation_instances(instances, device_image_to_column_f32_instances<1, GNWC>{}); add_device_operation_instances(instances, device_image_to_column_f32_instances<1, GNWC>{});
#else
ignore = instances;
#endif
} }
void add_device_image_to_column_nhwc_1d_i8_instances( void add_device_image_to_column_nwc_1d_i8_instances(
std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, int8_t, int8_t>>>& instances) std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_INT8
add_device_operation_instances(instances, device_image_to_column_i8_instances<1, GNWC>{}); add_device_operation_instances(instances, device_image_to_column_i8_instances<1, GNWC>{});
#else
ignore = instances;
#endif
} }
} // namespace instance } // namespace instance
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck { namespace ck {
...@@ -9,28 +9,51 @@ namespace tensor_operation { ...@@ -9,28 +9,51 @@ namespace tensor_operation {
namespace device { namespace device {
namespace instance { namespace instance {
using namespace ck::conv_tensor_rearrange_op;
void add_device_image_to_column_nhwc_2d_bf16_instances( void add_device_image_to_column_nhwc_2d_bf16_instances(
std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, BF16, BF16>>>& instances) std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_BF16
add_device_operation_instances(instances, device_image_to_column_bf16_instances<2, GNHWC>{}); add_device_operation_instances(instances, device_image_to_column_bf16_instances<2, GNHWC>{});
#else
ignore = instances;
#endif
} }
void add_device_image_to_column_nhwc_2d_f16_instances( void add_device_image_to_column_nhwc_2d_f16_instances(
std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, F16, F16>>>& instances) std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_FP16
add_device_operation_instances(instances, device_image_to_column_f16_instances<2, GNHWC>{}); add_device_operation_instances(instances, device_image_to_column_f16_instances<2, GNHWC>{});
#else
ignore = instances;
#endif
} }
void add_device_image_to_column_nhwc_2d_f32_instances( void add_device_image_to_column_nhwc_2d_f32_instances(
std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, F32, F32>>>& instances) std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_FP32
add_device_operation_instances(instances, device_image_to_column_f32_instances<2, GNHWC>{}); add_device_operation_instances(instances, device_image_to_column_f32_instances<2, GNHWC>{});
#else
ignore = instances;
#endif
} }
void add_device_image_to_column_nhwc_2d_i8_instances( void add_device_image_to_column_nhwc_2d_i8_instances(
std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, int8_t, int8_t>>>& instances) std::vector<
std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_INT8
add_device_operation_instances(instances, device_image_to_column_i8_instances<2, GNHWC>{}); add_device_operation_instances(instances, device_image_to_column_i8_instances<2, GNHWC>{});
#else
ignore = instances;
#endif
} }
} // namespace instance } // namespace instance
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck { namespace ck {
...@@ -9,28 +9,51 @@ namespace tensor_operation { ...@@ -9,28 +9,51 @@ namespace tensor_operation {
namespace device { namespace device {
namespace instance { namespace instance {
void add_device_image_to_column_nhwc_3d_bf16_instances( using namespace ck::conv_tensor_rearrange_op;
std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, BF16, BF16>>>& instances)
void add_device_image_to_column_ndhwc_3d_bf16_instances(
std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_BF16
add_device_operation_instances(instances, device_image_to_column_bf16_instances<3, GNDHWC>{}); add_device_operation_instances(instances, device_image_to_column_bf16_instances<3, GNDHWC>{});
#else
ignore = instances;
#endif
} }
void add_device_image_to_column_nhwc_3d_f16_instances( void add_device_image_to_column_ndhwc_3d_f16_instances(
std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, F16, F16>>>& instances) std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_FP16
add_device_operation_instances(instances, device_image_to_column_f16_instances<3, GNDHWC>{}); add_device_operation_instances(instances, device_image_to_column_f16_instances<3, GNDHWC>{});
#else
ignore = instances;
#endif
} }
void add_device_image_to_column_nhwc_3d_f32_instances( void add_device_image_to_column_ndhwc_3d_f32_instances(
std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, F32, F32>>>& instances) std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_FP32
add_device_operation_instances(instances, device_image_to_column_f32_instances<3, GNDHWC>{}); add_device_operation_instances(instances, device_image_to_column_f32_instances<3, GNDHWC>{});
#else
ignore = instances;
#endif
} }
void add_device_image_to_column_nhwc_3d_i8_instances( void add_device_image_to_column_ndhwc_3d_i8_instances(
std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, int8_t, int8_t>>>& instances) std::vector<
std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ImageToColumn>>>&
instances)
{ {
#ifdef CK_ENABLE_INT8
add_device_operation_instances(instances, device_image_to_column_i8_instances<3, GNDHWC>{}); add_device_operation_instances(instances, device_image_to_column_i8_instances<3, GNDHWC>{});
#else
ignore = instances;
#endif
} }
} // namespace instance } // namespace instance
......
...@@ -187,7 +187,7 @@ GB/s: 69.2301 ...@@ -187,7 +187,7 @@ GB/s: 69.2301
``` ```
Note: This kernel uses atomic add, which causes the output buffer to be accumulated multiple times and makes verification fail. To work around it, do not use CK's own timer and verification at the same time. Note: This kernel uses atomic add, which causes the output buffer to be accumulated multiple times and makes verification fail. To work around it, do not use CK's own timer and verification at the same time.
## Profile image to column kernels ## Profile image to column/column to image kernels
```bash ```bash
# arg1: tensor operation (" OP_NAME ": " OP_DESC ") # arg1: tensor operation (" OP_NAME ": " OP_DESC ")
# arg2: data type (0: Input fp32, Weight fp32, Output fp32 # arg2: data type (0: Input fp32, Weight fp32, Output fp32
...@@ -199,6 +199,7 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate ...@@ -199,6 +199,7 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate
# arg5: initialization (0: no init, 1: integer value, 2: decimal value) # arg5: initialization (0: no init, 1: integer value, 2: decimal value)
# arg6: print tensor value (0: no; 1: yes) # arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes) # arg7: time kernel (0: no, 1: yes)
# arg8: operation type (0: ImageToColumn, 1: ColumnToImage)
# Following arguments (depending on number of spatial dims): # Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d) # Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
# G, N, K, C, # G, N, K, C,
...@@ -209,8 +210,8 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate ...@@ -209,8 +210,8 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate
# <left padding>, (ie LeftPy, LeftPx for 2D) # <left padding>, (ie LeftPy, LeftPx for 2D)
# <right padding>, (ie RightPy, RightPx for 2D) # <right padding>, (ie RightPy, RightPx for 2D)
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx ################ op datatype layout verify init log time opType Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler image_to_column 0 0 1 1 0 1 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0 ./bin/ckProfiler conv_tensor_rearrange 0 0 0 1 0 1 0 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0
``` ```
...@@ -224,3 +225,4 @@ name: DeviceImageToColumn<128, 32, 64, 4> ...@@ -224,3 +225,4 @@ name: DeviceImageToColumn<128, 32, 64, 4>
avg_time: 3.12326 avg_time: 3.12326
GB/s: 2042.59 GB/s: 2042.59
``` ```
Note: The column-to-image kernel adds to the output memory, so the output buffer is accumulated multiple times, which causes verification to fail. To work around it, do not use CK's own timer and verification at the same time.
...@@ -9,9 +9,11 @@ ...@@ -9,9 +9,11 @@
#include <limits> #include <limits>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp" #include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp" #include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp" #include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
...@@ -19,22 +21,88 @@ ...@@ -19,22 +21,88 @@
#include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
using namespace conv_tensor_rearrange_op;
/// Builds the host-side input tensor for the selected rearrange operation.
///
/// The descriptor is chosen at compile time from ConvTensorRearrangeOp:
///   - ImageToColumn: the input is constructed from `image_desc`.
///   - ColumnToImage: the input is constructed from `gemm_desc`.
///
/// @throws std::runtime_error for any other ConvTensorRearrangeOp.
template <typename InputDataType, typename ConvTensorRearrangeOp>
Tensor<InputDataType> create_input(const HostTensorDescriptor& image_desc,
                                   const HostTensorDescriptor& gemm_desc)
{
    constexpr bool is_image_to_column = std::is_same_v<ConvTensorRearrangeOp, ImageToColumn>;
    constexpr bool is_column_to_image = std::is_same_v<ConvTensorRearrangeOp, ColumnToImage>;

    if constexpr(!(is_image_to_column || is_column_to_image))
    {
        throw std::runtime_error("Unsupported op!");
    }
    else
    {
        // Exactly one of the two flags holds here, so the ternary picks the matching descriptor.
        const HostTensorDescriptor& source_desc = is_image_to_column ? image_desc : gemm_desc;

        Tensor<InputDataType> tensor(source_desc);
        return tensor;
    }
}
/// Builds the host-side output tensor for the selected rearrange operation.
///
/// The descriptor is chosen at compile time from ConvTensorRearrangeOp:
///   - ImageToColumn: the output is constructed from `gemm_desc`.
///   - ColumnToImage: the output is constructed from `image_desc`.
///
/// @throws std::runtime_error for any other ConvTensorRearrangeOp.
template <typename OutputDataType, typename ConvTensorRearrangeOp>
Tensor<OutputDataType> create_output(const HostTensorDescriptor& image_desc,
                                     const HostTensorDescriptor& gemm_desc)
{
    constexpr bool is_image_to_column = std::is_same_v<ConvTensorRearrangeOp, ImageToColumn>;
    constexpr bool is_column_to_image = std::is_same_v<ConvTensorRearrangeOp, ColumnToImage>;

    if constexpr(!(is_image_to_column || is_column_to_image))
    {
        throw std::runtime_error("Unsupported op!");
    }
    else
    {
        // Exactly one of the two flags holds here, so the ternary picks the matching descriptor.
        const HostTensorDescriptor& result_desc = is_image_to_column ? gemm_desc : image_desc;

        Tensor<OutputDataType> tensor(result_desc);
        return tensor;
    }
}
template <index_t NDimSpatial,
typename InputLayout,
typename InputDataType,
typename OutputDataType,
typename ConvTensorRearrangeOp>
static auto make_ref_op()
{
if constexpr(std::is_same_v<ConvTensorRearrangeOp, ImageToColumn>)
{
return ck::tensor_operation::host::
ReferenceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>{};
}
else if constexpr(std::is_same_v<ConvTensorRearrangeOp, ColumnToImage>)
{
return ck::tensor_operation::host::
ReferenceColumnToImage<NDimSpatial, InputLayout, InputDataType, OutputDataType>{};
}
else
{
throw std::runtime_error("Unsupported op!");
}
}
template <index_t NDimSpatial, template <index_t NDimSpatial,
typename InputLayout, typename InputLayout,
typename InputDataType, typename InputDataType,
typename OutputDataType> typename OutputDataType,
bool profile_image_to_column_impl(int do_verification, typename ConvTensorRearrangeOp>
int init_method, bool profile_conv_tensor_rearrange_impl(int do_verification,
bool do_log, int init_method,
bool time_kernel, bool do_log,
const ck::utils::conv::ConvParam& conv_param) bool time_kernel,
const ck::utils::conv::ConvParam& conv_param)
{ {
const ck::index_t NDoHoWo = const ck::index_t NDoHoWo =
conv_param.N_ * conv_param.N_ *
...@@ -45,16 +113,16 @@ bool profile_image_to_column_impl(int do_verification, ...@@ -45,16 +113,16 @@ bool profile_image_to_column_impl(int do_verification,
ck::accumulate_n<ck::index_t>( ck::accumulate_n<ck::index_t>(
conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
const auto in_desc = const auto image_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InputLayout>( ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InputLayout>(
conv_param); conv_param);
const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX}); const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX});
std::array<ck::index_t, NDimSpatial> input_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
std::array<ck::index_t, NDimSpatial> output_spatial_lengths{}; std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{}; std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
std::array<ck::index_t, 2> output_m_k_strides{}; std::array<ck::index_t, 2> gemm_m_k_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}; std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{}; std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{}; std::array<ck::index_t, NDimSpatial> input_left_pads{};
...@@ -65,16 +133,19 @@ bool profile_image_to_column_impl(int do_verification, ...@@ -65,16 +133,19 @@ bool profile_image_to_column_impl(int do_verification,
copy(conv_param.input_spatial_lengths_, input_spatial_lengths); copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths); copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
copy(conv_param.output_spatial_lengths_, output_spatial_lengths); copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
copy(in_desc.GetStrides(), input_g_n_c_wis_strides); copy(image_desc.GetStrides(), image_g_n_c_wis_strides);
copy(out_desc.GetStrides(), output_m_k_strides); copy(gemm_desc.GetStrides(), gemm_m_k_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides); copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations); copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads); copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads); copy(conv_param.input_right_pads_, input_right_pads);
Tensor<InputDataType> input(in_desc); Tensor<InputDataType> input =
Tensor<OutputDataType> host_output(out_desc); create_input<InputDataType, ConvTensorRearrangeOp>(image_desc, gemm_desc);
Tensor<OutputDataType> device_output(out_desc); Tensor<OutputDataType> device_output =
create_output<OutputDataType, ConvTensorRearrangeOp>(image_desc, gemm_desc);
Tensor<OutputDataType> host_output =
create_output<OutputDataType, ConvTensorRearrangeOp>(image_desc, gemm_desc);
std::cout << "input: " << input.mDesc << std::endl; std::cout << "input: " << input.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl; std::cout << "output: " << host_output.mDesc << std::endl;
...@@ -94,17 +165,21 @@ bool profile_image_to_column_impl(int do_verification, ...@@ -94,17 +165,21 @@ bool profile_image_to_column_impl(int do_verification,
// run reference op // run reference op
if(do_verification) if(do_verification)
{ {
auto ref_image_to_column = ck::tensor_operation::host:: auto ref_conv_tensor_rearrange = make_ref_op<NDimSpatial,
ReferenceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>{}; InputLayout,
InputDataType,
OutputDataType,
ConvTensorRearrangeOp>();
auto ref_invoker = ref_image_to_column.MakeInvoker(); auto ref_invoker = ref_conv_tensor_rearrange.MakeInvoker();
auto ref_argument = ref_image_to_column.MakeArgument(input, auto ref_argument =
host_output, ref_conv_tensor_rearrange.MakeArgument(input,
conv_param.filter_spatial_lengths_, host_output,
conv_param.conv_filter_strides_, conv_param.filter_spatial_lengths_,
conv_param.conv_filter_dilations_, conv_param.conv_filter_strides_,
conv_param.input_left_pads_, conv_param.conv_filter_dilations_,
conv_param.input_right_pads_); conv_param.input_left_pads_,
conv_param.input_right_pads_);
// init host output to zero // init host output to zero
host_output.SetZero(); host_output.SetZero();
...@@ -112,8 +187,11 @@ bool profile_image_to_column_impl(int do_verification, ...@@ -112,8 +187,11 @@ bool profile_image_to_column_impl(int do_verification,
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
using DeviceOp = ck::tensor_operation::device:: using DeviceOp = ck::tensor_operation::device::DeviceConvTensorRearrange<NDimSpatial,
DeviceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>; InputLayout,
InputDataType,
OutputDataType,
ConvTensorRearrangeOp>;
// get device op instances // get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
...@@ -139,8 +217,8 @@ bool profile_image_to_column_impl(int do_verification, ...@@ -139,8 +217,8 @@ bool profile_image_to_column_impl(int do_verification,
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
input_g_n_c_wis_strides, image_g_n_c_wis_strides,
output_m_k_strides, gemm_m_k_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
......
...@@ -223,6 +223,12 @@ int profile_gemm_impl(int do_verification, ...@@ -223,6 +223,12 @@ int profile_gemm_impl(int do_verification,
{ {
std::cout << "Best Perf for datatype = int8"; std::cout << "Best Perf for datatype = int8";
} }
#if defined CK_ENABLE_FP8
else if constexpr(is_same<CDataType, f8_t>::value)
{
std::cout << "Best Perf for datatype = fp8";
}
#endif
if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value) if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
{ {
......
...@@ -28,7 +28,7 @@ set(PROFILER_SOURCES ...@@ -28,7 +28,7 @@ set(PROFILER_SOURCES
profile_contraction_bilinear.cpp profile_contraction_bilinear.cpp
profile_contraction_scale.cpp profile_contraction_scale.cpp
profile_grouped_conv_bwd_data.cpp profile_grouped_conv_bwd_data.cpp
profile_image_to_column.cpp profile_conv_tensor_rearrange.cpp
) )
if(DL_KERNELS) if(DL_KERNELS)
list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
...@@ -84,6 +84,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instanc ...@@ -84,6 +84,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instanc
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
if(DL_KERNELS) if(DL_KERNELS)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
endif() endif()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment