Commit b43ac5ef authored by Jing Zhang

merge conflict

parents b892a14a bc1108bb
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo,
// g, k]
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              F16,
                                                              F16,
                                                              Empty_Tuple,
                                                              F16,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_wmma_f16_instances<3,
                                                                              NDHWGC,
                                                                              GKZYXC,
                                                                              Empty_Tuple,
                                                                              NDHWGK,
                                                                              Empty_Tuple,
                                                                              PassThrough,
                                                                              ConvFwd1x1S1P0>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
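For context, a minimal consumer-side sketch (not part of the commit) of how client code typically uses a registration function like the one above: it fills the instance vector and lists each registered tuning configuration. The using-aliases mirror what CK's instance headers define, and GetTypeString() is the usual CK base-operator query; treat names and namespaces here as assumptions.

// Illustrative only; assumes the CK headers included above are on the path.
#include <iostream>
#include <memory>
#include <vector>

using ck::tensor_layout::convolution::GKZYXC;
using ck::tensor_layout::convolution::NDHWGC;
using ck::tensor_layout::convolution::NDHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using F16         = ck::half_t;
using Empty_Tuple = ck::Tuple<>;

void print_f16_1x1s1p0_kernels()
{
    namespace dev = ck::tensor_operation::device;

    // The element/layout template arguments must match the registration
    // function's signature exactly, or the overload will not be found.
    std::vector<std::unique_ptr<dev::DeviceGroupedConvFwdMultipleD<3,
                                                                   NDHWGC,
                                                                   GKZYXC,
                                                                   Empty_Tuple,
                                                                   NDHWGK,
                                                                   F16,
                                                                   F16,
                                                                   Empty_Tuple,
                                                                   F16,
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   PassThrough>>>
        instances;

    dev::instance::add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instances(
        instances);

    for(const auto& op : instances)
        std::cout << op->GetTypeString() << '\n'; // one line per tuning configuration
}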
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo,
// g, k]
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              F16,
                                                              F16,
                                                              Empty_Tuple,
                                                              F16,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_wmma_f16_instances<3,
                                                                              NDHWGC,
                                                                              GKZYXC,
                                                                              Empty_Tuple,
                                                                              NDHWGK,
                                                                              Empty_Tuple,
                                                                              PassThrough,
                                                                              ConvFwdDefault>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo,
// g, k]
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              F16,
                                                              F16,
                                                              Empty_Tuple,
                                                              F16,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_wmma_f16_instances<3,
                                                                              NDHWGC,
                                                                              GKZYXC,
                                                                              Empty_Tuple,
                                                                              NDHWGK,
                                                                              Empty_Tuple,
                                                                              PassThrough,
                                                                              ConvFwdOddC>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo,
// g, k]
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_wmma_i8_instances<3,
                                                                             NDHWGC,
                                                                             GKZYXC,
                                                                             Empty_Tuple,
                                                                             NDHWGK,
                                                                             Empty_Tuple,
                                                                             PassThrough,
                                                                             ConvFwd1x1P0>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo,
// g, k]
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_wmma_i8_instances<3,
                                                                             NDHWGC,
                                                                             GKZYXC,
                                                                             Empty_Tuple,
                                                                             NDHWGK,
                                                                             Empty_Tuple,
                                                                             PassThrough,
                                                                             ConvFwd1x1S1P0>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo,
// g, k]
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_wmma_i8_instances<3,
                                                                             NDHWGC,
                                                                             GKZYXC,
                                                                             Empty_Tuple,
                                                                             NDHWGK,
                                                                             Empty_Tuple,
                                                                             PassThrough,
                                                                             ConvFwdDefault>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo,
// g, k]
void add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<3,
                                                              NDHWGC,
                                                              GKZYXC,
                                                              Empty_Tuple,
                                                              NDHWGK,
                                                              int8_t,
                                                              int8_t,
                                                              Empty_Tuple,
                                                              int8_t,
                                                              PassThrough,
                                                              PassThrough,
                                                              PassThrough>>>& instances)
{
    add_device_operation_instances(instances,
                                   device_grouped_conv_fwd_wmma_i8_instances<3,
                                                                             NDHWGC,
                                                                             GKZYXC,
                                                                             Empty_Tuple,
                                                                             NDHWGK,
                                                                             Empty_Tuple,
                                                                             PassThrough,
                                                                             ConvFwdOddC>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
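The registration files above differ only in data type and in the specialization tag passed as the last template argument (ConvFwdDefault, ConvFwd1x1P0, ConvFwd1x1S1P0, ConvFwdOddC). As a rough sketch of how a caller might bucket a problem into one of those tags, here is an illustrative helper; the enum and predicates are mine, named after the tags, and are not CK API.

// Sketch: picking a forward-conv specialization bucket from problem
// parameters. Real dispatch lives inside the CK library; this is only a
// restatement of what the tag names imply.
#include <algorithm>
#include <vector>

enum class FwdSpec { Default, Filter1x1Pad0, Filter1x1Stride1Pad0, OddC };

FwdSpec choose_fwd_spec(const std::vector<int>& filter_lengths, // e.g. {Z, Y, X}
                        const std::vector<int>& strides,
                        const std::vector<int>& left_pads,
                        const std::vector<int>& right_pads,
                        int channels_per_group)
{
    auto all_are = [](const std::vector<int>& v, int x) {
        return std::all_of(v.begin(), v.end(), [x](int e) { return e == x; });
    };

    const bool filter_1x1 = all_are(filter_lengths, 1);
    const bool pad_0      = all_are(left_pads, 0) && all_are(right_pads, 0);
    const bool stride_1   = all_are(strides, 1);

    if(filter_1x1 && stride_1 && pad_0)
        return FwdSpec::Filter1x1Stride1Pad0; // -> ConvFwd1x1S1P0 instances
    if(filter_1x1 && pad_0)
        return FwdSpec::Filter1x1Pad0; // -> ConvFwd1x1P0 instances
    if(channels_per_group % 2 != 0)
        return FwdSpec::OddC; // -> ConvFwdOddC instances
    return FwdSpec::Default; // -> ConvFwdDefault instances
}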
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using namespace ck::conv_tensor_rearrange_op;

void add_device_image_to_column_nwc_1d_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_BF16
    add_device_operation_instances(instances, device_image_to_column_bf16_instances<1, GNWC>{});
#else
    ignore = instances;
#endif
}

void add_device_image_to_column_nwc_1d_f16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_FP16
    add_device_operation_instances(instances, device_image_to_column_f16_instances<1, GNWC>{});
#else
    ignore = instances;
#endif
}

void add_device_image_to_column_nwc_1d_f32_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_FP32
    add_device_operation_instances(instances, device_image_to_column_f32_instances<1, GNWC>{});
#else
    ignore = instances;
#endif
}

void add_device_image_to_column_nwc_1d_i8_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_INT8
    add_device_operation_instances(instances, device_image_to_column_i8_instances<1, GNWC>{});
#else
    ignore = instances;
#endif
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
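Unlike the grouped-conv files earlier in this commit, each body here is wrapped in a CK_ENABLE_* preprocessor guard, so the library still builds and links when a data type is configured out; the `ignore = instances;` branch merely consumes the parameter to silence unused-parameter warnings and leaves the instance list empty.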
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using namespace ck::conv_tensor_rearrange_op;

void add_device_image_to_column_nhwc_2d_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_BF16
    add_device_operation_instances(instances, device_image_to_column_bf16_instances<2, GNHWC>{});
#else
    ignore = instances;
#endif
}

void add_device_image_to_column_nhwc_2d_f16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_FP16
    add_device_operation_instances(instances, device_image_to_column_f16_instances<2, GNHWC>{});
#else
    ignore = instances;
#endif
}

void add_device_image_to_column_nhwc_2d_f32_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_FP32
    add_device_operation_instances(instances, device_image_to_column_f32_instances<2, GNHWC>{});
#else
    ignore = instances;
#endif
}

void add_device_image_to_column_nhwc_2d_i8_instances(
    std::vector<
        std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_INT8
    add_device_operation_instances(instances, device_image_to_column_i8_instances<2, GNHWC>{});
#else
    ignore = instances;
#endif
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using namespace ck::conv_tensor_rearrange_op;

void add_device_image_to_column_ndhwc_3d_bf16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_BF16
    add_device_operation_instances(instances, device_image_to_column_bf16_instances<3, GNDHWC>{});
#else
    ignore = instances;
#endif
}

void add_device_image_to_column_ndhwc_3d_f16_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_FP16
    add_device_operation_instances(instances, device_image_to_column_f16_instances<3, GNDHWC>{});
#else
    ignore = instances;
#endif
}

void add_device_image_to_column_ndhwc_3d_f32_instances(
    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_FP32
    add_device_operation_instances(instances, device_image_to_column_f32_instances<3, GNDHWC>{});
#else
    ignore = instances;
#endif
}

void add_device_image_to_column_ndhwc_3d_i8_instances(
    std::vector<
        std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ImageToColumn>>>&
        instances)
{
#ifdef CK_ENABLE_INT8
    add_device_operation_instances(instances, device_image_to_column_i8_instances<3, GNDHWC>{});
#else
    ignore = instances;
#endif
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
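Shape-wise, the ImageToColumn rearrange these instances implement is the classic im2col: for the grouped channel-last layouts above, each group yields, roughly, a GEMM input of (N x output-spatial) rows by (filter-spatial x C) columns. A small sketch of that arithmetic, using the standard convolution output-length formula (names here are mine, not CK's):

// Illustrative im2col shape math; not CK code.
#include <cstdint>

// Standard convolution output length along one spatial dimension.
int64_t conv_out_len(int64_t in_len, int64_t filter_len, int64_t stride,
                     int64_t dilation, int64_t left_pad, int64_t right_pad)
{
    return (in_len + left_pad + right_pad - dilation * (filter_len - 1) - 1) / stride + 1;
}

// Example matching the 2D profiler run shown below (Hi = Wi = 28, Y = X = 3,
// stride = dilation = 1, no padding): Ho = Wo = 26, so each group's im2col
// matrix is (N * 26 * 26) rows by (3 * 3 * C) columns.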
```
...
GB/s: 69.2301
```

Note: This kernel uses atomic add, so timed repetitions accumulate into the output buffer and break verification. To work around this, do not enable CK's own timer and verification at the same time.

## Profile image to column/column to image kernels

```bash
# arg1: tensor operation (" OP_NAME ": " OP_DESC ")
# arg2: data type (0: Input fp32, Weight fp32, Output fp32
# ...
# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# arg8: operation type (0: ImageToColumn, 1: ColumnToImage)
# Following arguments (depending on number of spatial dims):
# Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
# G, N, K, C,
# ...
# <left padding>, (ie LeftPy, LeftPx for 2D)
# <right padding>, (ie RightPy, RightPx for 2D)
################ op datatype layout verify init log time opType Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler conv_tensor_rearrange 0 0 0 1 0 1 0 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0
```

...

```
name: DeviceImageToColumn<128, 32, 64, 4>
avg_time: 3.12326
GB/s: 2042.59
```

Note: The column to image kernel adds to the output memory, so timed repetitions accumulate into the output buffer and break verification. To work around this, do not enable CK's own timer and verification at the same time.
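For reference, a hypothetical column-to-image run of the same 2D problem, following the argument table above (opType = 1 selects ColumnToImage); verification is on and CK's timer is off so the accumulation caveat above does not break it:

```bash
# op = conv_tensor_rearrange, verify = 1, time = 0, opType = 1 (ColumnToImage)
./bin/ckProfiler conv_tensor_rearrange 0 0 1 1 0 0 1 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0
```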