Merge remote-tracking branch 'origin/develop' into migraphx-update

ef326c73 · Alan Turner · b7775add · e4dfe4d8 · ef326c73 · ef326c73
Commit ef326c73 authored Nov 19, 2024 by Alan Turner
20 changed files
--- a/example/09_convnd_fwd/README.md
+++ b/example/09_convnd_fwd/README.md
@@ -16,17 +16,3 @@
 # <right padding>, (ie RightPy, RightPx for 2D)
 ./bin/example_convnd_fwd_xdl 0 1 100
 ```
-Result (MI100 @ 1087Mhz, 33.4TFlops peak FP32)
-```
-input: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-weights: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-output: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-arg.a_grid_desc_k0_m_k1_{432, 165888, 4}
-arg.b_grid_desc_k0_n_k1_{432, 256, 4}
-arg.c_grid_desc_m_n_{ 165888, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 100 times...
-Perf: 4.43736 ms, 33.0753 TFlops, 150.357 GB/s
-```
--- a/example/09_convnd_fwd/convnd_fwd_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <cstdlib>
 #include <iostream>
@@ -27,6 +27,88 @@ void print_helper_msg()
              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
 }
+template <typename DataType>
+inline __host__ __device__ constexpr double get_rtol()
+{
+    if constexpr(std::is_same_v<DataType, float>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, double>)
+    {
+        return 1e-6;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+    {
+        return 5e-2;
+    }
+    else if constexpr(std::is_same_v<DataType, int32_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, int8_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+    {
+        return 1e-1; // 240 and 224 are acceptable
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+    {
+        return 1.5e-1; // 57344 and 49152 are acceptable
+    }
+    else
+    {
+        return 1e-3;
+    }
+}
+template <typename DataType>
+inline __host__ __device__ constexpr double get_atol()
+{
+    if constexpr(std::is_same_v<DataType, float>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, double>)
+    {
+        return 1e-6;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::half_t>)
+    {
+        return 1e-3;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
+    {
+        return 5e-2;
+    }
+    else if constexpr(std::is_same_v<DataType, int32_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, int8_t>)
+    {
+        return 1e-1;
+    }
+    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
+    {
+        return 16.1; // 240 and 224 are acceptable
+    }
+    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
+    {
+        return 8192.1; // 57344 and 49152 are acceptable
+    }
+    else
+    {
+        return 1e-3;
+    }
+}
 template <ck::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
@@ -164,8 +246,11 @@ bool run_grouped_conv_fwd(bool do_verification,
        out_device_buf.FromDevice(out_device.mData.data());
-        return ck::utils::check_err(
+        return ck::utils::check_err(out_device,
-            out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
+                                    out_host,
+                                    "Error: incorrect results!",
+                                    get_rtol<OutDataType>(),
+                                    get_atol<OutDataType>());
    }
    return true;

--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
@@ -3,7 +3,7 @@
 #include "convnd_fwd_common.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
 template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,

--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+using InDataType       = ck::bf8_t;
+using WeiDataType      = ck::bf8_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::f8_t;
+using OutDataType      = ck::f8_t;
+using ComputeType      = ck::bf8_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8,
+        ComputeType>;
+#include "run_convnd_fwd_example.inc"
+int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/09_convnd_fwd/convnd_fwd_xdl_bf8_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf8_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+using InDataType       = ck::bf8_t;
+using WeiDataType      = ck::f8_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::f8_t;
+using OutDataType      = ck::f8_t;
+using AComputeType     = ck::bf8_t;
+using BComputeType     = ck::f8_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8,
+        AComputeType,
+        BComputeType>;
+#include "run_convnd_fwd_example.inc"
+int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -3,7 +3,7 @@
 #include "convnd_fwd_common.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
 template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,

--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16_comp_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16_comp_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+using InDataType       = ck::half_t;
+using WeiDataType      = ck::half_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::half_t;
+using OutDataType      = ck::half_t;
+using ComputeType      = ck::f8_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::UnaryConvert;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8,
+        ComputeType>;
+#include "run_convnd_fwd_example.inc"
+int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
@@ -3,7 +3,7 @@
 #include "convnd_fwd_common.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
 template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,

--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
@@ -3,7 +3,7 @@
 #include "convnd_fwd_common.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
 template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,

--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+using InDataType       = ck::f8_t;
+using WeiDataType      = ck::f8_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::f8_t;
+using OutDataType      = ck::f8_t;
+using ComputeDataType  = ck::f8_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8,
+        ComputeDataType>;
+#include "run_convnd_fwd_example.inc"
+int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp8_bf8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp8_bf8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "convnd_fwd_common.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+using InDataType       = ck::f8_t;
+using WeiDataType      = ck::bf8_t;
+using AccDataType      = float;
+using CShuffleDataType = ck::f8_t;
+using OutDataType      = ck::f8_t;
+using AComputeType     = ck::f8_t;
+using BComputeType     = ck::bf8_t;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        OutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8,
+        AComputeType,
+        BComputeType>;
+#include "run_convnd_fwd_example.inc"
+int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
@@ -3,7 +3,7 @@
 #include "convnd_fwd_common.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
 template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,

--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
-list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+add_custom_target(example_convnd_fwd_reduce_xdl)
-set(target 0)
+add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
-foreach(gpu IN LISTS GPU_TARGETS)
+add_example_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8)
- if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_custom_target(example_convnd_fwd_reduce_xdl)
+add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
-   if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
+add_example_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16)
-    add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
-    add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8)
+add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp)
-   endif()
+add_example_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16)
-   if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
-    add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
+add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp)
-    add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16)
+add_example_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32)
-   endif()
-   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+if(USE_BITINT_EXTENSION_INT4)
-    add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp)
+    add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp)
-    add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16)
+    add_example_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4)
-   endif()
+endif(USE_BITINT_EXTENSION_INT4)
-   if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
-    add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp)
-    add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32)
-   endif()
-   if(USE_BITINT_EXTENSION_INT4)
-      add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp)
-      add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4)
-   endif(USE_BITINT_EXTENSION_INT4)
-   set(target 1)
- endif()
-endforeach()
\ No newline at end of file
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <algorithm>
 #include <cassert>
@@ -80,7 +80,7 @@ using RLayout = typename LayoutSettingSelector<NDimSpatial>::RLayout;
 struct ExecutionConfig final
 {
    bool do_verification = true;
-    int init_method      = 1;
+    int init_method      = 2;
    bool time_kernel     = false;
 };
@@ -139,7 +139,7 @@ inline bool parse_cmd_args(int argc,
 inline HostTensorDescriptor
 make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
 {
-    std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
+    std::vector<ck::long_index_t> dimensions{problem_size.G_, problem_size.N_};
    ck::ranges::copy(problem_size.output_spatial_lengths_, std::back_inserter(dimensions));

--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-#error Should compile this file with ck::int4_t support
-#endif
 #define BUILD_INT4_EXAMPLE
@@ -24,3 +22,4 @@ using RsDataType        = ck::Tuple<R0DataType>;
 #include "run_convnd_fwd_max_example.inc"
 int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
+#endif
--- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
@@ -73,16 +73,25 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
    Tensor<EDataType> conv_output_device(conv_output_g_n_k_wos_desc);
    Tensor<R0DataType> r0_device(r0_desc);
+    std::cout << "input: " << conv_input.mDesc << std::endl;
+    std::cout << "weight: " << conv_weight.mDesc << std::endl;
+    std::cout << "output: " << conv_output_device.mDesc << std::endl;
+    std::cout << "reduction: " << r0_device.mDesc << std::endl << std::endl;
    switch(config.init_method)
    {
    case 0: break;
    case 1:
        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input);
-        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-1, 1}(conv_weight);
+        break;
+    case 2:
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input);
+        ck::utils::FillUniformDistribution<BDataType>{-1, 1}(conv_weight);
        break;
    default:
-        ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input);
+        ck::utils::FillUniformDistribution<ADataType>{-8, 7}(conv_input);
-        ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight);
+        ck::utils::FillUniformDistribution<BDataType>{-1, 1}(conv_weight);
    }
    DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
@@ -161,15 +170,25 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
        return false;
    }
+    // XXX: DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle will not initialize r0.
+    r0_device_buf.SetValue(ck::NumericLimits<R0DataType>::Lowest());
    const float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
-    const std::size_t flop      = problem_size.GetFlops();
+    if(config.time_kernel)
-    const std::size_t num_btype = problem_size.GetByte<ADataType, BDataType, EDataType>();
+    {
+        const std::size_t flop      = problem_size.GetFlops();
+        const std::size_t num_btype = problem_size.GetByte<ADataType, BDataType, EDataType>();
-    const float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+        const float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
-    const float gb_per_sec = num_btype / 1.E6 / avg_time;
+        const float gb_per_sec = num_btype / 1.E6 / avg_time;
-    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+        std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-              << conv.GetTypeString() << std::endl;
+                  << " GB/s, " << conv.GetTypeString() << std::endl;
+    }
+    else
+    {
+        std::cout << "FINISHED: " << conv.GetTypeString() << std::endl;
+    }
    if(config.do_verification)
    {
@@ -189,6 +208,7 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
                                                  BElementOp{},
                                                  PassThrough{});
+        std::cout << "\nRunning verification on CPU." << std::endl;
        ref_invoker.Run(ref_argument);
        Tensor<R0DataType> r0_host(r0_device.mDesc);
@@ -273,13 +293,18 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
        conv_output_device_buf.FromDevice(conv_output_device.mData.data());
        r0_device_buf.FromDevice(r0_device.mData.data());
-        return ck::utils::check_err(conv_output_device,
+        auto pass = ck::utils::check_err(conv_output_device,
-                                    conv_output_host,
+                                         conv_output_host,
-                                    "Error: incorrect results! (Matrix E)",
+                                         "Error: incorrect results! (Matrix E)",
-                                    1e-5f,
+                                         1e-3f,
-                                    1e-4f) &&
+                                         1e-3f);
-               ck::utils::check_err(
+        pass =
-                   r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f);
+            pass && ck::utils::check_err(
+                        r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-3f, 1e-3f);
+        if(pass)
+            std::cout << "Verification on CPU: PASS" << std::endl;
+        return pass;
    }
    return true;

--- a/example/12_reduce/CMakeLists.txt
+++ b/example/12_reduce/CMakeLists.txt
 add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
+add_example_executable(example_reduce_threadwise_multi_d reduce_threadwise_multi_d.cpp)
 add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp)
 add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp)
--- a/example/12_reduce/README.md
+++ b/example/12_reduce/README.md
@@ -2,7 +2,7 @@
 ## Run ```example_reduce_blockwise```
 ```bash
-# -D <xxx> : input 3d/4d/5d tensor lengths
+# -D <xxx> : input 3D/4D/5D tensor lengths
 # -R <xxx> : reduce dimension ids
 # -v <x> :   verification (0=no, 1=yes)
 #arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64, 7: int4)
@@ -22,7 +22,7 @@ Perf: 0.238063 ms, 264.285 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSr
 ## Run ```example_reduce_multiblock_atomic_add```
 ```bash
-# -D <xxx> : input 3d/4d/5d tensor lengths
+# -D <xxx> : input 3D/4D/5D tensor lengths
 # -R <xxx> : reduce dimension ids
 # -v <x> :   verification (0=no, 1=yes)
 #arg1: data type (0: fp32, 1: fp64)

--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <initializer_list>
@@ -255,34 +255,61 @@ int main(int argc, char* argv[])
    else
    {
        // for testing half_t
+        pass =
+            pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass =
            pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                        true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing float
+        pass =
+            pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing double
+        pass =
+            pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing bhalf_t
+        pass = pass &&
+               reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                   true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass &&
               reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                   true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing int8_t
+        pass =
+            pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass =
            pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
                        true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
        // for testing int4_t using AVG operation
+        pass =
+            pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing int4_t using MAX operation
+        pass =
+            pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 #endif

--- a/example/12_reduce/reduce_blockwise_impl.hpp
+++ b/example/12_reduce/reduce_blockwise_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -316,7 +316,17 @@ int reduce_blockwise_impl(bool do_verification,
    auto invoker_ptr = reduce.MakeInvokerPointer();
-    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+    int log_level = 0, cold_niters = 5, nrepeat = 50;
+    if(beta != 0.0f)
+    {
+        std::cerr << "Warning: With beta != 0.0f there must be only one repeat for correct results "
+                     "since out memory is being overwritten."
+                  << std::endl;
+        cold_niters = 0;
+        nrepeat     = 1;
+    }
+    float avg_time = invoker_ptr->Run(
+        argument_ptr.get(), StreamConfig{nullptr, time_kernel, log_level, cold_niters, nrepeat});
    std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
                            invariant_total_length * sizeof(InOutDataType);