fix example build

d8fdd226 · Chao Liu · ed3c27cc · d8fdd226 · d8fdd226 · d8fdd226
Commit d8fdd226 authored Jul 26, 2022 by Chao Liu
4 changed files
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -43,6 +43,7 @@ using ALayout  = Row;
 using BLayout  = Col;
 using D0Layout = Row;
 using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
 using ELayout  = Row;

 using AElementOp   = PassThrough;
@@ -53,11 +54,11 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa

 // clang-format off
 using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
-//######| ALayout| BLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-//######|        |        |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######|        |        |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-//######|        |        |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+//######| ALayout| BLayout| DsLayout| ELayout|     AData|     BData|     AccData|         CShuffle|     DsData|     EData|           A|           B|          CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//######|        |        |         |        |      Type|      Type|        Type|         DataType|       Type|      Type| Elementwise| Elementwise|  Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######|        |        |         |        |          |          |            |                 |           |          |   Operation|   Operation|    Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+//######|        |        |         |        |          |          |            |                 |           |          |            |            |             |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType,  AElementOp,  BElementOp, CDEElementOp,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 // clang-format on

 int main(int argc, char* argv[])

--- a/example/20_convnd_bwd_weight/convnd_bwd_weight_xdl_bf16.cpp
+++ b/example/20_convnd_bwd_weight/convnd_bwd_weight_xdl_bf16.cpp
@@ -63,15 +63,16 @@ using DeviceConvndBwdWeightInstance =

 int main(int argc, char* argv[])
 {
+    namespace ctc = ck::tensor_layout::convolution;
+
    print_helper_msg();

    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
-    int num_dim_spatial  = 2;

-    ck::utils::conv::ConvParam params{
-        2, 32, 256, 1024, {3, 3}, {14, 14}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+    ck::utils::conv::ConvParam conv_param{
+        2, 1, 32, 256, 1024, {3, 3}, {14, 14}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};

    ck::index_t split_k = 4;

@@ -87,12 +88,12 @@ int main(int argc, char* argv[])
    }
    else
    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-        num_dim_spatial = std::stoi(argv[4]);
+        do_verification                   = std::stoi(argv[1]);
+        init_method                       = std::stoi(argv[2]);
+        time_kernel                       = std::stoi(argv[3]);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);

-        params = parse_conv_params(num_dim_spatial, 5, argv);
+        conv_param = parse_conv_param(num_dim_spatial, 5, argv);

        split_k = std::stoi(argv[5 + 3 + 6 * num_dim_spatial - 1]);
        split_k = std::max(1, split_k);
@@ -102,12 +103,22 @@ int main(int argc, char* argv[])
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};

-    if(num_dim_spatial == 1)
+    if(conv_param.num_dim_spatial_ == 1)
    {
+        using InLayout  = ctc::GNWC;
+        using WeiLayout = ctc::GKXC;
+        using OutLayout = ctc::GNWK;
+
+        const auto in_g_n_c_wis_desc =
+            make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+        const auto out_g_n_k_wos_desc =
+            make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
        return run_conv_bwd_weight<1,
-                                   ck::tensor_layout::convolution::NWC,
-                                   ck::tensor_layout::convolution::KXC,
-                                   ck::tensor_layout::convolution::NWK,
                                   InDataType,
                                   WeiDataType,
                                   OutDataType,
@@ -117,18 +128,31 @@ int main(int argc, char* argv[])
                                   DeviceConvndBwdWeightInstance<1>>(do_verification,
                                                                     init_method,
                                                                     time_kernel,
-                                                                     params,
+                                                                     conv_param,
+                                                                     in_g_n_c_wis_desc,
+                                                                     wei_g_k_c_xs_desc,
+                                                                     out_g_n_k_wos_desc,
                                                                     in_element_op,
                                                                     wei_element_op,
                                                                     out_element_op,
                                                                     split_k);
    }
-    else if(num_dim_spatial == 2)
+    else if(conv_param.num_dim_spatial_ == 2)
    {
+        using InLayout  = ctc::GNHWC;
+        using WeiLayout = ctc::GKYXC;
+        using OutLayout = ctc::GNHWK;
+
+        const auto in_g_n_c_wis_desc =
+            make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+        const auto out_g_n_k_wos_desc =
+            make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
        return run_conv_bwd_weight<2,
-                                   ck::tensor_layout::convolution::NHWC,
-                                   ck::tensor_layout::convolution::KYXC,
-                                   ck::tensor_layout::convolution::NHWK,
                                   InDataType,
                                   WeiDataType,
                                   OutDataType,
@@ -138,18 +162,31 @@ int main(int argc, char* argv[])
                                   DeviceConvndBwdWeightInstance<2>>(do_verification,
                                                                     init_method,
                                                                     time_kernel,
-                                                                     params,
+                                                                     conv_param,
+                                                                     in_g_n_c_wis_desc,
+                                                                     wei_g_k_c_xs_desc,
+                                                                     out_g_n_k_wos_desc,
                                                                     in_element_op,
                                                                     wei_element_op,
                                                                     out_element_op,
                                                                     split_k);
    }
-    else if(num_dim_spatial == 3)
+    else if(conv_param.num_dim_spatial_ == 3)
    {
+        using InLayout  = ctc::GNDHWC;
+        using WeiLayout = ctc::GKZYXC;
+        using OutLayout = ctc::GNDHWK;
+
+        const auto in_g_n_c_wis_desc =
+            make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+
+        const auto out_g_n_k_wos_desc =
+            make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
        return run_conv_bwd_weight<3,
-                                   ck::tensor_layout::convolution::NDHWC,
-                                   ck::tensor_layout::convolution::KZYXC,
-                                   ck::tensor_layout::convolution::NDHWK,
                                   InDataType,
                                   WeiDataType,
                                   OutDataType,
@@ -159,7 +196,10 @@ int main(int argc, char* argv[])
                                   DeviceConvndBwdWeightInstance<3>>(do_verification,
                                                                     init_method,
                                                                     time_kernel,
-                                                                     params,
+                                                                     conv_param,
+                                                                     in_g_n_c_wis_desc,
+                                                                     wei_g_k_c_xs_desc,
+                                                                     out_g_n_k_wos_desc,
                                                                     in_element_op,
                                                                     wei_element_op,
                                                                     out_element_op,

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute_xdl.hpp
@@ -31,7 +31,6 @@ namespace device {
 * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and
 * returns the 2D index of the tile that it computes. \see
 * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
- *
 * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
 * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
 * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link

--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -39,6 +39,12 @@ struct PassThrough
        y = x;
    }

+    template <>
+    __host__ __device__ void operator()<int32_t, int32_t>(int32_t& y, const int32_t& x) const
+    {
+        y = x;
+    }
+
    template <>
    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
    {