Commit d8fdd226 authored by Chao Liu

fix example build

parent ed3c27cc
@@ -43,6 +43,7 @@ using ALayout = Row;
 using BLayout = Col;
 using D0Layout = Row;
 using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
 using ELayout = Row;
 using AElementOp = PassThrough;
@@ -53,11 +54,11 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // clang-format off
 using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
-//######| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-//######| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
 // clang-format on
 int main(int argc, char* argv[])
...
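The change above threads a new DsLayout tuple through the DeviceGemmMultipleD_Xdl_CShuffle instance, alongside the existing DsDataType. The standalone sketch below is not the CK API (std::tuple stands in for ck::Tuple; the stand-in types are illustrative); it only shows the intended pairing: one layout entry per extra D tensor, element for element with the data-type tuple.

// Standalone illustration only; std::tuple stands in for ck::Tuple.
#include <tuple>

struct Row {}; // stand-in layout tags
struct Col {};

using D0DataType = float; // stand-in element types for the two D tensors
using D1DataType = float;

using DsLayout   = std::tuple<Row, Row>;            // mirrors ck::Tuple<D0Layout, D1Layout>
using DsDataType = std::tuple<D0DataType, D1DataType>;

// A device op that now takes both tuples would expect them to describe the
// same number of D tensors: one layout per data type.
static_assert(std::tuple_size_v<DsLayout> == std::tuple_size_v<DsDataType>,
              "one layout entry per D tensor");

int main() { return 0; }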
@@ -63,15 +63,16 @@ using DeviceConvndBwdWeightInstance =
 int main(int argc, char* argv[])
 {
+    namespace ctc = ck::tensor_layout::convolution;
+
     print_helper_msg();
     bool do_verification = true;
     int init_method = 1;
     bool time_kernel = false;
-    int num_dim_spatial = 2;
-    ck::utils::conv::ConvParam params{
-        2, 32, 256, 1024, {3, 3}, {14, 14}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+    ck::utils::conv::ConvParam conv_param{
+        2, 1, 32, 256, 1024, {3, 3}, {14, 14}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
     ck::index_t split_k = 4;
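The ConvParam initializer in this hunk gains a second positional value. The standalone stand-in below is not ck::utils::conv::ConvParam; its field names and ordering are an assumption read off this example, with the new second value interpreted as the group count.

#include <vector>

// Stand-in struct: assumed field order of the updated initializer.
struct ConvParamSketch
{
    int num_dim_spatial;
    int group_count; // the value this commit inserts
    int N;           // batch
    int K;           // output channels
    int C;           // input channels
    std::vector<int> filter_lengths;
    std::vector<int> input_lengths;
    std::vector<int> strides;
    std::vector<int> dilations;
    std::vector<int> left_pads;
    std::vector<int> right_pads;
};

int main()
{
    // Same values as the example's default 2D problem.
    ConvParamSketch p{2, 1, 32, 256, 1024, {3, 3}, {14, 14}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
    return p.group_count == 1 ? 0 : 1;
}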
@@ -87,12 +88,12 @@ int main(int argc, char* argv[])
     }
     else
     {
         do_verification = std::stoi(argv[1]);
         init_method = std::stoi(argv[2]);
         time_kernel = std::stoi(argv[3]);
-        num_dim_spatial = std::stoi(argv[4]);
-        params = parse_conv_params(num_dim_spatial, 5, argv);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+        conv_param = parse_conv_param(num_dim_spatial, 5, argv);
         split_k = std::stoi(argv[5 + 3 + 6 * num_dim_spatial - 1]);
         split_k = std::max(1, split_k);
@@ -102,12 +103,22 @@ int main(int argc, char* argv[])
     const auto wei_element_op = WeiElementOp{};
     const auto out_element_op = OutElementOp{};
-    if(num_dim_spatial == 1)
+    if(conv_param.num_dim_spatial_ == 1)
     {
+        using InLayout  = ctc::GNWC;
+        using WeiLayout = ctc::GKXC;
+        using OutLayout = ctc::GNWK;
+
+        const auto in_g_n_c_wis_desc =
+            make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+        const auto out_g_n_k_wos_desc =
+            make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
         return run_conv_bwd_weight<1,
-                                   ck::tensor_layout::convolution::NWC,
-                                   ck::tensor_layout::convolution::KXC,
-                                   ck::tensor_layout::convolution::NWK,
                                    InDataType,
                                    WeiDataType,
                                    OutDataType,
@@ -117,18 +128,31 @@ int main(int argc, char* argv[])
                                    DeviceConvndBwdWeightInstance<1>>(do_verification,
                                                                      init_method,
                                                                      time_kernel,
-                                                                     params,
+                                                                     conv_param,
+                                                                     in_g_n_c_wis_desc,
+                                                                     wei_g_k_c_xs_desc,
+                                                                     out_g_n_k_wos_desc,
                                                                      in_element_op,
                                                                      wei_element_op,
                                                                      out_element_op,
                                                                      split_k);
     }
-    else if(num_dim_spatial == 2)
+    else if(conv_param.num_dim_spatial_ == 2)
     {
+        using InLayout  = ctc::GNHWC;
+        using WeiLayout = ctc::GKYXC;
+        using OutLayout = ctc::GNHWK;
+
+        const auto in_g_n_c_wis_desc =
+            make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+        const auto out_g_n_k_wos_desc =
+            make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
         return run_conv_bwd_weight<2,
-                                   ck::tensor_layout::convolution::NHWC,
-                                   ck::tensor_layout::convolution::KYXC,
-                                   ck::tensor_layout::convolution::NHWK,
                                    InDataType,
                                    WeiDataType,
                                    OutDataType,
@@ -138,18 +162,31 @@ int main(int argc, char* argv[])
                                    DeviceConvndBwdWeightInstance<2>>(do_verification,
                                                                      init_method,
                                                                      time_kernel,
-                                                                     params,
+                                                                     conv_param,
+                                                                     in_g_n_c_wis_desc,
+                                                                     wei_g_k_c_xs_desc,
+                                                                     out_g_n_k_wos_desc,
                                                                      in_element_op,
                                                                      wei_element_op,
                                                                      out_element_op,
                                                                      split_k);
     }
-    else if(num_dim_spatial == 3)
+    else if(conv_param.num_dim_spatial_ == 3)
     {
+        using InLayout  = ctc::GNDHWC;
+        using WeiLayout = ctc::GKZYXC;
+        using OutLayout = ctc::GNDHWK;
+
+        const auto in_g_n_c_wis_desc =
+            make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+        const auto out_g_n_k_wos_desc =
+            make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
         return run_conv_bwd_weight<3,
-                                   ck::tensor_layout::convolution::NDHWC,
-                                   ck::tensor_layout::convolution::KZYXC,
-                                   ck::tensor_layout::convolution::NDHWK,
                                    InDataType,
                                    WeiDataType,
                                    OutDataType,
@@ -159,7 +196,10 @@ int main(int argc, char* argv[])
                                    DeviceConvndBwdWeightInstance<3>>(do_verification,
                                                                      init_method,
                                                                      time_kernel,
-                                                                     params,
+                                                                     conv_param,
+                                                                     in_g_n_c_wis_desc,
+                                                                     wei_g_k_c_xs_desc,
+                                                                     out_g_n_k_wos_desc,
                                                                      in_element_op,
                                                                      wei_element_op,
                                                                      out_element_op,
...
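For reference, the example reads split_k from argv[5 + 3 + 6 * num_dim_spatial - 1], i.e. the last argument after the convolution sizes parsed from argv[5] onward. The small standalone program below only evaluates that index for the three supported spatial dimensionalities; the surrounding argument layout in the comments is an assumption based on this example, not taken from print_helper_msg().

#include <iostream>

int main()
{
    // Assumed layout: argv[1..3] = do_verification, init_method, time_kernel;
    // argv[4] = num_dim_spatial; argv[5..] = sizes consumed by parse_conv_param();
    // the final argument is split_k.
    for(int num_dim_spatial : {1, 2, 3})
    {
        const int split_k_arg = 5 + 3 + 6 * num_dim_spatial - 1; // 13, 19, 25
        std::cout << "num_dim_spatial = " << num_dim_spatial
                  << " -> split_k is read from argv[" << split_k_arg << "]\n";
    }
    return 0;
}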
@@ -31,7 +31,6 @@ namespace device {
  * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and
  * returns the 2D index of the tile that it computes. \see
  * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
- *
  * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
  * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
  * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link
...
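To make the \note above concrete, here is a minimal host-side sketch of the kind of per-batch pointer-offset functor it describes; the struct and member names are illustrative assumptions, not the actual ComputePtrOffsetOfBatch interface.

#include <cstdint>

// Illustrative only: a batch-dependent element offset added to the base
// pointers lets two workgroups work on tiles of two different matrices.
struct BatchPtrOffsetSketch
{
    std::int64_t a_batch_stride; // elements between consecutive A matrices
    std::int64_t b_batch_stride; // elements between consecutive B matrices
    std::int64_t e_batch_stride; // elements between consecutive E matrices

    std::int64_t GetAPtrOffset(std::int64_t g_idx) const { return g_idx * a_batch_stride; }
    std::int64_t GetBPtrOffset(std::int64_t g_idx) const { return g_idx * b_batch_stride; }
    std::int64_t GetEPtrOffset(std::int64_t g_idx) const { return g_idx * e_batch_stride; }
};

int main()
{
    BatchPtrOffsetSketch offsets{256 * 1024, 1024 * 512, 256 * 512};
    // A workgroup assigned to batch index 1 would read A at base + GetAPtrOffset(1).
    return offsets.GetAPtrOffset(1) == 256 * 1024 ? 0 : 1;
}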
@@ -39,6 +39,12 @@ struct PassThrough
         y = x;
     }
 
+    template <>
+    __host__ __device__ void operator()<int32_t, int32_t>(int32_t& y, const int32_t& x) const
+    {
+        y = x;
+    }
+
     template <>
     __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
     {
...
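The new int32_t specialization follows the same pattern as the existing PassThrough specializations. A self-contained sketch of that pattern is below; it places the explicit specializations at namespace scope so it builds with any standard C++ compiler, whereas the CK header keeps them inside the struct as shown in the diff, and the struct name here is a stand-in.

#include <cstdint>
#include <cassert>

// Stand-in elementwise functor: a templated operator() plus explicit
// specializations for the supported (Y, X) type pairs.
struct PassThroughSketch
{
    template <typename Y, typename X>
    void operator()(Y& y, const X& x) const;
};

template <>
void PassThroughSketch::operator()<float, float>(float& y, const float& x) const
{
    y = x;
}

// The pair this commit adds: int32_t -> int32_t.
template <>
void PassThroughSketch::operator()<int32_t, int32_t>(int32_t& y, const int32_t& x) const
{
    y = x;
}

int main()
{
    PassThroughSketch op;
    int32_t y = 0;
    op(y, int32_t{7});
    assert(y == 7);
    return 0;
}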