Commit cba8f7f2 authored by Anthony Chang
Browse files

Merge remote-tracking branch 'upstream/develop' into gemm-layernorm-4

parents cc50b687 b653c5eb
#ifndef DEVICE_CONV2D_WRW_XDL_C_SHUFFLE_NHWC_KYXC_NHWK_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONV2D_WRW_XDL_C_SHUFFLE_NHWC_KYXC_NHWK_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "device.hpp"
#include "device_base.hpp" #include "ck/utility/common_header.hpp"
#include "device_conv_backward_weight.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "convolution_forward_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv_backward_weight.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp"
#include "gridwise_gemm_xdlops_bwd_weight.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -773,4 +777,3 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -773,4 +777,3 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_CONV2D_BWD_DATA_XDL_NHWC_KYXC_NHWK_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONV2D_BWD_DATA_XDL_NHWC_KYXC_NHWK_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "device.hpp"
#include "device_base.hpp" #include "ck/utility/common_header.hpp"
#include "device_conv_bwd_data.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "convolution_backward_data_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp" #include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -821,4 +824,3 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K ...@@ -821,4 +824,3 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_CONV2D_FWD_XDL_C_SHUFFLE_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONV2D_FWD_XDL_C_SHUFFLE_BIAS_ACTIVATION_ADD_NHWC_KYXC_NHWK_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "device.hpp"
#include "device_base.hpp" #include "ck/utility/common_header.hpp"
#include "device_conv_fwd_bias_activation_add.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "convolution_forward_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp"
#include "gridwise_gemm_xdlops_v3r3.hpp" #include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -963,4 +966,3 @@ struct ...@@ -963,4 +966,3 @@ struct
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "device.hpp" #include <vector>
#include "device_base.hpp"
#include "device_conv_fwd_bias_activation.hpp" #include "ck/utility/common_header.hpp"
#include "convolution_forward_specialization.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "common_header.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "gridwise_gemm_xdlops_v3r2.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
#ifndef DEVICE_CONV2D_FWD_XDL_C_SHUFFLE_NHWC_KYXC_NHWK_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONV2D_FWD_XDL_C_SHUFFLE_NHWC_KYXC_NHWK_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "device.hpp"
#include "device_base.hpp" #include "ck/utility/common_header.hpp"
#include "device_conv_fwd.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "convolution_forward_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp"
#include "gridwise_gemm_xdlops_v3r1.hpp" #include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -879,4 +882,3 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W ...@@ -879,4 +882,3 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_CONV2D_FWD_XDL_NHWC_KYXC_NHWK_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONV2D_FWD_XDL_NHWC_KYXC_NHWK_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "device.hpp"
#include "device_base.hpp" #include "ck/utility/common_header.hpp"
#include "device_conv_fwd.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "convolution_forward_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp" #include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -714,9 +717,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K ...@@ -714,9 +717,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
return str.str(); return str.str();
} }
}; // namespace device };
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef DEVICE_CONV3D_FWD_NAIVE_HPP #ifndef DEVICE_CONV3D_FWD_NAIVE_HPP
#define DEVICE_CONV3D_FWD_NAIVE_HPP #define DEVICE_CONV3D_FWD_NAIVE_HPP
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef DEVICE_CONV3D_FWD_XDL_HPP #ifndef DEVICE_CONV3D_FWD_XDL_HPP
#define DEVICE_CONV3D_FWD_XDL_HPP #define DEVICE_CONV3D_FWD_XDL_HPP
......
#ifndef DEVICE_CONV_WRW_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONV_WRW_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <iostream> #include <iostream>
#include "device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -44,4 +48,3 @@ using DeviceConvBwdWeightPtr = std::unique_ptr< ...@@ -44,4 +48,3 @@ using DeviceConvBwdWeightPtr = std::unique_ptr<
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_CONV_BWD_DATA_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONV_BWD_DATA_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <iostream> #include <iostream>
#include "device_base.hpp"
#include "element_wise_operation.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -44,4 +48,3 @@ using DeviceConvBwdDataPtr = std::unique_ptr< ...@@ -44,4 +48,3 @@ using DeviceConvBwdDataPtr = std::unique_ptr<
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_CONV_FWD_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONV_FWD_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream> #include <iostream>
#include "device_base.hpp" #include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -43,4 +47,3 @@ using DeviceConvFwdPtr = std::unique_ptr< ...@@ -43,4 +47,3 @@ using DeviceConvFwdPtr = std::unique_ptr<
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_CONV_FWD_BIAS_ACTIVATION_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONV_FWD_BIAS_ACTIVATION_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <iostream> #include <iostream>
#include "device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -46,4 +51,3 @@ using DeviceConvFwdBiasActivationPtr = ...@@ -46,4 +51,3 @@ using DeviceConvFwdBiasActivationPtr =
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
#ifndef DEVICE_CONV_FWD_BIAS_ACTIVATION_ADD_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONV_FWD_BIAS_ACTIVATION_ADD_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <iostream> #include <iostream>
#include "device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -47,4 +51,3 @@ using DeviceConvFwdBiasActivationAddPtr = ...@@ -47,4 +51,3 @@ using DeviceConvFwdBiasActivationAddPtr =
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "device.hpp"
#include "device_base.hpp" #include "ck/utility/common_header.hpp"
#include "device_conv_backward_weight.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "convolution_backward_weight_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv_backward_weight.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp"
#include "gridwise_gemm_xdlops_bwd_weight.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp"
#include "gridwise_unary_elementwise_1d.hpp" #include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -433,7 +437,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -433,7 +437,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
using namespace ck; using namespace ck;
const index_t Di = input_spatial_lengths[0]; const index_t Di = input_spatial_lengths[0];
const index_t Hi = input_spatial_lengths[2]; const index_t Hi = input_spatial_lengths[1];
const index_t Wi = input_spatial_lengths[2]; const index_t Wi = input_spatial_lengths[2];
const index_t Do = output_spatial_lengths[0]; const index_t Do = output_spatial_lengths[0];
...@@ -671,11 +675,14 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -671,11 +675,14 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
return PadDescriptor_M0_1d(desc, gridSize, blockSize); return PadDescriptor_M0_1d(desc, gridSize, blockSize);
} }
using TypeConvertFunctor = using TypeConvertFp32ToBf16Functor =
ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>; ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
using GridDesc_M0 = decltype(MakeDescriptor_M0<1>({1}, {1}, 1, 1)); using GridDesc_M0 = decltype(MakeDescriptor_M0<1>({1}, {1}, 1, 1));
using GridwiseUEltwise = using GridwiseUEltwise = GridwiseUnaryElementwise_1D<AccDataType,
GridwiseUnaryElementwise_1D<AccDataType, InDataType, GridDesc_M0, TypeConvertFunctor, 4>; InDataType,
GridDesc_M0,
TypeConvertFp32ToBf16Functor,
4>;
using ABCGridDescs = decltype(GetABCGridDesc<NumDimSpatial>()); using ABCGridDescs = decltype(GetABCGridDesc<NumDimSpatial>());
...@@ -979,33 +986,32 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -979,33 +986,32 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1); const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
float ave_time = 0; float ave_time = 0;
const auto Run = [&](const auto& kernel) { const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
const auto run_conv = [&](const auto& kernel) {
hipGetErrorString(hipMemset( hipGetErrorString(hipMemset(
arg.p_c_grid_, arg.p_c_grid_,
0, 0,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
sizeof(CDataType))); sizeof(CDataType)));
ave_time = return launch_and_time_kernel(stream_config,
launch_and_time_kernel(stream_config, kernel,
kernel, dim3(grid_size),
dim3(grid_size), dim3(BlockSize),
dim3(BlockSize), 0,
0, arg.p_a_grid_,
arg.p_a_grid_, arg.p_b_grid_,
arg.p_b_grid_, arg.p_c_grid_,
arg.p_c_grid_, arg.a_grid_desc_kbatch_k0_m_k1_,
arg.a_grid_desc_kbatch_k0_m_k1_, arg.b_grid_desc_kbatch_k0_n_k1_,
arg.b_grid_desc_kbatch_k0_n_k1_, arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, arg.a_element_op_,
arg.a_element_op_, arg.b_element_op_,
arg.b_element_op_, arg.c_element_op_,
arg.c_element_op_, arg.block_2_ctile_map_);
arg.block_2_ctile_map_);
}; };
// run kernel for bf16 with splitk // run kernel for bf16 with splitk
...@@ -1016,22 +1022,21 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -1016,22 +1022,21 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
sizeof(AccDataType))); sizeof(AccDataType)));
ave_time = return launch_and_time_kernel(stream_config,
launch_and_time_kernel(stream_config, kernel,
kernel, dim3(grid_size),
dim3(grid_size), dim3(BlockSize),
dim3(BlockSize), 0,
0, arg.p_a_grid_,
arg.p_a_grid_, arg.p_b_grid_,
arg.p_b_grid_, static_cast<AccDataType*>(arg.p_workspace_),
static_cast<AccDataType*>(arg.p_workspace_), arg.a_grid_desc_kbatch_k0_m_k1_,
arg.a_grid_desc_kbatch_k0_m_k1_, arg.b_grid_desc_kbatch_k0_n_k1_,
arg.b_grid_desc_kbatch_k0_n_k1_, arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, arg.a_element_op_,
arg.a_element_op_, arg.b_element_op_,
arg.b_element_op_, arg.c_element_op_,
arg.c_element_op_, arg.block_2_ctile_map_);
arg.block_2_ctile_map_);
}; };
// kernel for type conversion // kernel for type conversion
...@@ -1059,7 +1064,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -1059,7 +1064,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
// run kernel for type conversion // run kernel for type conversion
void* p_c_grid_tmp_ = static_cast<void*>(arg.p_c_grid_); void* p_c_grid_tmp_ = static_cast<void*>(arg.p_c_grid_);
InDataType* p_c_grid_tmp_bf16_ = static_cast<InDataType*>(p_c_grid_tmp_); InDataType* p_c_grid_tmp_bf16_ = static_cast<InDataType*>(p_c_grid_tmp_);
const auto Run_type_convert = [&](const auto& kernel) { const auto run_type_convert = [&](const auto& kernel) {
float elapsed_time = float elapsed_time =
launch_and_time_kernel(stream_config, launch_and_time_kernel(stream_config,
kernel, kernel,
...@@ -1070,14 +1075,15 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -1070,14 +1075,15 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
p_c_grid_tmp_bf16_, p_c_grid_tmp_bf16_,
a_grid_desc_m0_, a_grid_desc_m0_,
b_grid_desc_m0_, b_grid_desc_m0_,
TypeConvertFunctor{}); TypeConvertFp32ToBf16Functor{});
return elapsed_time; return elapsed_time;
}; };
if constexpr(std::is_same<InDataType, ck::bhalf_t>::value) if constexpr(std::is_same<InDataType, ck::bhalf_t>::value)
{ {
if(has_main_k0_block_loop) auto launch_kernel = [&](auto has_main_k_block_loop) {
{ constexpr bool has_main_loop = has_main_k_block_loop.value;
if(kbatch == 1) if(kbatch == 1)
{ {
const auto kernel = kernel_gemm_xdlops_bwd_weight< const auto kernel = kernel_gemm_xdlops_bwd_weight<
...@@ -1092,9 +1098,9 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -1092,9 +1098,9 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
InElementwiseOperation, InElementwiseOperation,
WeiElementwiseOperation, WeiElementwiseOperation,
remove_reference_t<DeviceOp::Block2CTileMap>, remove_reference_t<DeviceOp::Block2CTileMap>,
true>; has_main_loop>;
Run(kernel); return run_conv(kernel);
} }
else else
{ {
...@@ -1103,7 +1109,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -1103,7 +1109,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
AccDataType, AccDataType,
InDataType, InDataType,
GridDesc_M0, GridDesc_M0,
TypeConvertFunctor>; TypeConvertFp32ToBf16Functor>;
const auto kernel_conv = kernel_gemm_xdlops_bwd_weight< const auto kernel_conv = kernel_gemm_xdlops_bwd_weight<
GridwiseGemmAtomicAddFloatBf16Splitk, GridwiseGemmAtomicAddFloatBf16Splitk,
...@@ -1117,56 +1123,28 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -1117,56 +1123,28 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
InElementwiseOperation, InElementwiseOperation,
WeiElementwiseOperation, WeiElementwiseOperation,
remove_reference_t<DeviceOp::Block2CTileMap>, remove_reference_t<DeviceOp::Block2CTileMap>,
true>; has_main_loop>;
run_bf16_splitk(kernel_conv); float elapsed_time = 0;
ave_time += Run_type_convert(kernel_type_convert); elapsed_time += run_bf16_splitk(kernel_conv);
elapsed_time += run_type_convert(kernel_type_convert);
return elapsed_time;
} }
};
if(has_main_k0_block_loop)
{
ave_time = launch_kernel(integral_constant<bool, true>{});
} }
else else
{ {
if(kbatch == 1) ave_time = launch_kernel(integral_constant<bool, false>{});
{
const auto kernel = kernel_gemm_xdlops_bwd_weight<
GridwiseGemm,
ADataType, // TODO: distiguish A/B datatype
CDataType,
remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
remove_reference_t<
DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
OutElementwiseOperation,
InElementwiseOperation,
WeiElementwiseOperation,
remove_reference_t<DeviceOp::Block2CTileMap>,
false>;
Run(kernel);
}
else
{
const auto kernel = kernel_gemm_xdlops_bwd_weight<
GridwiseGemmAtomicAddFloatBf16Splitk,
ADataType, // TODO: distiguish A/B datatype
AccDataType,
remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
remove_reference_t<
DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
OutElementwiseOperation,
InElementwiseOperation,
WeiElementwiseOperation,
remove_reference_t<DeviceOp::Block2CTileMap>,
false>;
run_bf16_splitk(kernel);
}
} }
} }
else else
{ {
if(has_main_k0_block_loop) auto launch_kernel = [&](auto has_main_k_block_loop) {
{ constexpr bool has_main_loop = has_main_k_block_loop.value;
if(kbatch == 1) if(kbatch == 1)
{ {
const auto kernel = kernel_gemm_xdlops_bwd_weight< const auto kernel = kernel_gemm_xdlops_bwd_weight<
...@@ -1181,9 +1159,9 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -1181,9 +1159,9 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
InElementwiseOperation, InElementwiseOperation,
WeiElementwiseOperation, WeiElementwiseOperation,
remove_reference_t<DeviceOp::Block2CTileMap>, remove_reference_t<DeviceOp::Block2CTileMap>,
true>; has_main_loop>;
Run(kernel); return run_conv(kernel);
} }
else else
{ {
...@@ -1199,49 +1177,18 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ ...@@ -1199,49 +1177,18 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
InElementwiseOperation, InElementwiseOperation,
WeiElementwiseOperation, WeiElementwiseOperation,
remove_reference_t<DeviceOp::Block2CTileMap>, remove_reference_t<DeviceOp::Block2CTileMap>,
true>; has_main_loop>;
Run(kernel); return run_conv(kernel);
} }
};
if(has_main_k0_block_loop)
{
ave_time = launch_kernel(integral_constant<bool, true>{});
} }
else else
{ {
if(kbatch == 1) ave_time = launch_kernel(integral_constant<bool, false>{});
{
const auto kernel = kernel_gemm_xdlops_bwd_weight<
GridwiseGemm,
ADataType, // TODO: distiguish A/B datatype
CDataType,
remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
remove_reference_t<
DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
OutElementwiseOperation,
InElementwiseOperation,
WeiElementwiseOperation,
remove_reference_t<DeviceOp::Block2CTileMap>,
false>;
Run(kernel);
}
else
{
const auto kernel = kernel_gemm_xdlops_bwd_weight<
GridwiseGemmAtomicAdd,
ADataType, // TODO: distiguish A/B datatype
CDataType,
remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
remove_reference_t<
DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
OutElementwiseOperation,
InElementwiseOperation,
WeiElementwiseOperation,
remove_reference_t<DeviceOp::Block2CTileMap>,
false>;
Run(kernel);
}
} }
} }
......
#ifndef DEVICE_CONVND_BWD_DATA_XDL_NDHWC_KZYXC_NDHWK_HPP // SPDX-License-Identifier: MIT
#define DEVICE_CONVND_BWD_DATA_XDL_NDHWC_KZYXC_NDHWK_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "device.hpp"
#include "device_base.hpp" #include "ck/utility/common_header.hpp"
#include "device_conv_bwd_data.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "convolution_backward_data_specialization.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
#include "tensor_descriptor.hpp" #include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp" #include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -1546,4 +1549,3 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho ...@@ -1546,4 +1549,3 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <functional> #include <functional>
...@@ -6,16 +9,15 @@ ...@@ -6,16 +9,15 @@
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include "device.hpp" #include "ck/utility/common_header.hpp"
#include "device_prop.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "device_base.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "device_conv_fwd.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "convolution_forward_specialization.hpp" #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "common_header.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "tensor_layout.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp"
#include "tensor_descriptor.hpp" #include "ck/device_utility/device_prop.hpp"
#include "tensor_descriptor_helper.hpp" #include "ck/device_utility/kernel_launch.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
#include <iostream> #include <iostream>
#include "device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
......
#ifndef DEVICE_GEMM_BIAS_ACTIVATION_HPP // SPDX-License-Identifier: MIT
#define DEVICE_GEMM_BIAS_ACTIVATION_HPP // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream> #include <iostream>
#include "device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -40,4 +43,3 @@ using DeviceGemmBiasActivationPtr = std::unique_ptr< ...@@ -40,4 +43,3 @@ using DeviceGemmBiasActivationPtr = std::unique_ptr<
} // namespace device } // namespace device
} // namespace tensor_operation } // namespace tensor_operation
} // namespace ck } // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef DEVICE_GEMM_BIAS_ACTIVATION_ADD_HPP #ifndef DEVICE_GEMM_BIAS_ACTIVATION_ADD_HPP
#define DEVICE_GEMM_BIAS_ACTIVATION_ADD_HPP #define DEVICE_GEMM_BIAS_ACTIVATION_ADD_HPP
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment