Merge branch 'develop' into wavelet_model

7bcaf2a7 · Adam Osewski · GitHub · e59daa22 · 0345963e · 7bcaf2a7
Unverified Commit 7bcaf2a7 authored Dec 19, 2022 by Adam Osewski Committed by GitHub Dec 19, 2022
20 changed files
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -618,9 +618,9 @@ pipeline {
                stage('Clang Format') {
                    agent{ label rocmnode("nogpu") }
                    environment{
-                        execute_cmd = "find .. -iname \'*.h\' \
+                        execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
-                                -o -iname \'*.hpp\' \
+                                -o -not -path \'*.git*\' -iname \'*.hpp\' \
-                                -o -iname \'*.cpp\' \
+                                -o -not -path \'*.git*\' -iname \'*.cpp\' \
                                -o -iname \'*.h.in\' \
                                -o -iname \'*.hpp.in\' \
                                -o -iname \'*.cpp.in\' \

--- a/client_example/09_quantization/CMakeLists.txt
+++ b/client_example/09_quantization/CMakeLists.txt
+add_executable(client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp)
+target_link_libraries(client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations)
 add_executable(client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp)
 target_link_libraries(client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations)
+add_executable(client_conv2d_fwd_perchannel_quantization conv2d_fwd_perchannel_quantization.cpp)
+target_link_libraries(client_conv2d_fwd_perchannel_quantization PRIVATE composable_kernel::device_operations)
 add_executable(client_conv2d_fwd_perlayer_quantization conv2d_fwd_perlayer_quantization.cpp)
 target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable_kernel::device_operations)
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <iostream>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+using InDataType           = int8_t;
+using WeiDataType          = int8_t;
+using BiasDataType         = int32_t;
+using RequantScaleDataType = float;
+using OutDataType          = int8_t;
+using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
+using BiasLayout         = ck::tensor_layout::convolution::G_K;
+using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
+using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp       = ck::tensor_operation::element_wise::Relu;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 4;
+static constexpr ck::index_t K             = 64;
+static constexpr ck::index_t C             = 32;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Hi            = 71;
+static constexpr ck::index_t Wi            = 71;
+static constexpr ck::index_t Ho            = 36;
+static constexpr ck::index_t Wo            = 36;
+/// Owns one device allocation: hipMalloc in the constructor, hipFree in the
+/// destructor. The HIP status codes are deliberately discarded (best effort).
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    /// Requests mem_size bytes of device memory.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    // Non-copyable: a copy would make two objects hipFree the same pointer.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    /// Returns the raw device pointer held by this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+/// Times every registered instance of the int8 grouped conv2d forward operation
+/// with the Add + ReLU + per-channel Mul + Clamp output element-wise op, prints
+/// the fastest one, and runs that instance once more without timing.
+///
+/// @return 0 in all cases, including when no instance supports the problem.
+int main(int argc, char* argv[])
+{
+    // Lengths are listed in (G, N, C or K, H, W) order; the strides place the
+    // channel dimension innermost (stride 1).
+    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
+    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
+    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+    // Bias and requant scale vary only along G and K; the zero strides
+    // broadcast them over N, Ho and Wo.
+    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
+    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
+    // The output has K channels (it was previously described with C).
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 2> in_left_pad{1, 1};
+    std::array<ck::index_t, 2> in_right_pad{1, 1};
+    std::array<ck::index_t, 2> conv_strides{2, 2};
+    std::array<ck::index_t, 2> conv_dilations{1, 1};
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
+    // With strides {K, 0, 1, 0, 0} only G * K elements are ever addressed.
+    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+        NumDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout, RequantScaleLayout>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        ck::Tuple<BiasDataType, RequantScaleDataType>,
+        OutDataType,
+        PassThrough,
+        PassThrough,
+        OutElementOp>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    // Single place that builds the argument object, so the timing loop and the
+    // final run always describe exactly the same problem.
+    const auto make_argument = [&](const auto& op_ptr) {
+        return op_ptr->MakeArgumentPointer(
+            in.GetDeviceBuffer(),
+            wei.GetDeviceBuffer(),
+            {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
+            out.GetDeviceBuffer(),
+            in_lengths,
+            in_strides,
+            weight_lengths,
+            weight_strides,
+            {bias_lengths, requant_scale_lengths},
+            {bias_strides, requant_scale_strides},
+            out_lengths,
+            out_strides,
+            conv_strides,
+            conv_dilations,
+            in_left_pad,
+            in_right_pad,
+            PassThrough{},
+            PassThrough{},
+            OutElementOp{ActivationOp{}});
+    };
+    std::string best_op_name;
+    int best_op_id        = -1; // stays -1 when no instance supports the problem
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr   = make_argument(op_ptr);
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            // Widen before multiplying so larger problem sizes cannot overflow
+            // the 32-bit index type.
+            std::size_t flop = static_cast<std::size_t>(2) * G * N * K * C * Ho * Wo * Y * X;
+            std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
+                                    G * sizeof(WeiDataType) * K * Y * X * C +
+                                    G * sizeof(OutDataType) * N * Ho * Wo * K;
+            // avg_time is printed as milliseconds below, hence 1e9 / 1e6
+            // rather than 1e12 / 1e9.
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_op_id      = static_cast<int>(i);
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    // Indexing op_ptrs with -1 would be undefined behaviour, so only report and
+    // re-run when at least one instance was actually timed.
+    if(best_op_id >= 0)
+    {
+        std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+                  << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+        // run the best instance
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = make_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    else
+    {
+        std::cout << "No instance supports this problem" << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
--- a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_bias_forward_perlayer_quantization.hpp"
+#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

--- a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <iostream>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+using InDataType           = int8_t;
+using WeiDataType          = int8_t;
+using RequantScaleDataType = float;
+using OutDataType          = int8_t;
+using InLayout           = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
+using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
+using OutLayout          = ck::tensor_layout::convolution::GNHWK;
+using PassThrough        = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp       = PassThrough;
+using OutElementOp       = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 4;
+static constexpr ck::index_t K             = 64;
+static constexpr ck::index_t C             = 32;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Hi            = 71;
+static constexpr ck::index_t Wi            = 71;
+static constexpr ck::index_t Ho            = 36;
+static constexpr ck::index_t Wo            = 36;
+/// Owns one device allocation: hipMalloc in the constructor, hipFree in the
+/// destructor. The HIP status codes are deliberately discarded (best effort).
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    /// Requests mem_size bytes of device memory.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    // Non-copyable: a copy would make two objects hipFree the same pointer.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    /// Returns the raw device pointer held by this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+/// Times every registered instance of the int8 grouped conv2d forward operation
+/// with the per-channel Mul + Clamp output element-wise op, prints the fastest
+/// one, and runs that instance once more without timing.
+///
+/// @return 0 in all cases, including when no instance supports the problem.
+int main(int argc, char* argv[])
+{
+    // Lengths are listed in (G, N, C or K, H, W) order; the strides place the
+    // channel dimension innermost (stride 1).
+    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
+    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
+    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+    // The requant scale varies only along G and K; the zero strides broadcast
+    // it over N, Ho and Wo.
+    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
+    // The output has K channels (it was previously described with C).
+    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
+    std::array<ck::index_t, 2> in_left_pad{1, 1};
+    std::array<ck::index_t, 2> in_right_pad{1, 1};
+    std::array<ck::index_t, 2> conv_strides{2, 2};
+    std::array<ck::index_t, 2> conv_dilations{1, 1};
+    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
+    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
+    // With strides {K, 0, 1, 0, 0} only G * K elements are ever addressed.
+    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
+                                                                    InLayout,
+                                                                    WeiLayout,
+                                                                    ck::Tuple<RequantScaleLayout>,
+                                                                    OutLayout,
+                                                                    InDataType,
+                                                                    WeiDataType,
+                                                                    ck::Tuple<RequantScaleDataType>,
+                                                                    OutDataType,
+                                                                    PassThrough,
+                                                                    PassThrough,
+                                                                    OutElementOp>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    // Single place that builds the argument object. The final run previously
+    // passed empty braces for the requant-scale pointer/lengths/strides, i.e.
+    // a null scale tensor; sharing this helper keeps both runs identical.
+    const auto make_argument = [&](const auto& op_ptr) {
+        return op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                           wei.GetDeviceBuffer(),
+                                           {requant_scale.GetDeviceBuffer()},
+                                           out.GetDeviceBuffer(),
+                                           in_lengths,
+                                           in_strides,
+                                           weight_lengths,
+                                           weight_strides,
+                                           {requant_scale_lengths},
+                                           {requant_scale_strides},
+                                           out_lengths,
+                                           out_strides,
+                                           conv_strides,
+                                           conv_dilations,
+                                           in_left_pad,
+                                           in_right_pad,
+                                           PassThrough{},
+                                           PassThrough{},
+                                           OutElementOp{ActivationOp{}});
+    };
+    std::string best_op_name;
+    int best_op_id        = -1; // stays -1 when no instance supports the problem
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    float best_tflops     = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr   = make_argument(op_ptr);
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            // Widen before multiplying so larger problem sizes cannot overflow
+            // the 32-bit index type.
+            std::size_t flop = static_cast<std::size_t>(2) * G * N * K * C * Ho * Wo * Y * X;
+            std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
+                                    G * sizeof(WeiDataType) * K * Y * X * C +
+                                    G * sizeof(OutDataType) * N * Ho * Wo * K;
+            // avg_time is printed as milliseconds below, hence 1e9 / 1e6
+            // rather than 1e12 / 1e9.
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_op_id      = static_cast<int>(i);
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+                best_tflops     = tflops;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    // Indexing op_ptrs with -1 would be undefined behaviour, so only report and
+    // re-run when at least one instance was actually timed.
+    if(best_op_id >= 0)
+    {
+        std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+                  << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+        // run the best instance
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = make_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    else
+    {
+        std::cout << "No instance supports this problem" << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
--- a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
+++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_perlayer_quantization.hpp"
+#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

--- a/client_example/13_batchnorm/CMakeLists.txt
+++ b/client_example/13_batchnorm/CMakeLists.txt
 add_executable(client_batchnorm_fwd_nhwc batchnorm_fwd_nhwc.cpp)
+add_executable(client_batchnorm_bwd_nhwc batchnorm_bwd_nhwc.cpp)
 target_link_libraries(client_batchnorm_fwd_nhwc PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_batchnorm_bwd_nhwc PRIVATE composable_kernel::device_operations)
--- a/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
+++ b/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <functional>
+#include <numeric>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp"
+using XDataType           = ck::half_t;
+using DxDataType          = float;
+using DyDataType          = float;
+using AccDataType         = float;
+using ScaleDataType       = ck::half_t;
+using DscaleDbiasDataType = float;
+using MeanVarDataType     = float;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+constexpr int Rank                  = 4;
+constexpr int NumBatchNormReduceDim = 3;
+const double epsilon = std::numeric_limits<float>::epsilon();
+/// Owns one device allocation: hipMalloc in the constructor, hipFree in the
+/// destructor. The HIP status codes are deliberately discarded (best effort).
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    /// Requests mem_size bytes of device memory.
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    // Non-copyable: a copy would make two objects hipFree the same pointer.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    /// Returns the raw device pointer held by this object.
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+/// Times every registered batchnorm-backward instance on a rank-4 tensor
+/// reduced over its first three dimensions, prints the fastest one, and runs
+/// that instance once more without timing.
+///
+/// @return 0 in all cases, including when no instance supports the problem.
+int main(int argc, char* argv[])
+{
+    std::array<ck::index_t, Rank> xyLengths{16, 8, 128, 256};
+    std::array<ck::index_t, Rank> xyStrides{8 * 128 * 256, 128 * 256, 256, 1};
+    std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarLengths{256};
+    std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarStrides{1};
+    std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
+    ck::index_t numXYElement =
+        std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies<ck::index_t>());
+    ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(),
+                                                             scaleBiasMeanVarLengths.end(),
+                                                             1,
+                                                             std::multiplies<ck::index_t>());
+    SimpleDeviceMem x(sizeof(XDataType) * numXYElement);
+    SimpleDeviceMem dy(sizeof(DyDataType) * numXYElement);
+    SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem dx(sizeof(DxDataType) * numXYElement);
+    SimpleDeviceMem dscale(sizeof(DscaleDbiasDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem dbias(sizeof(DscaleDbiasDataType) * numScaleBiasMeanVarElement);
+    using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
+                                                                      DxDataType,
+                                                                      DyDataType,
+                                                                      AccDataType,
+                                                                      ScaleDataType,
+                                                                      DscaleDbiasDataType,
+                                                                      MeanVarDataType,
+                                                                      PassThrough,
+                                                                      Rank,
+                                                                      NumBatchNormReduceDim>;
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    // Single place that builds the argument object, so the timing loop and the
+    // final run always describe exactly the same problem. The same stride
+    // arrays are reused for every tensor of a given shape.
+    const auto make_argument = [&](const auto& op_ptr) {
+        return op_ptr->MakeArgumentPointer(xyLengths,
+                                           xyStrides,
+                                           xyStrides,
+                                           xyStrides,
+                                           reduceDims,
+                                           scaleBiasMeanVarLengths,
+                                           scaleBiasMeanVarStrides,
+                                           scaleBiasMeanVarStrides,
+                                           scaleBiasMeanVarStrides,
+                                           x.GetDeviceBuffer(),
+                                           dy.GetDeviceBuffer(),
+                                           scale.GetDeviceBuffer(),
+                                           mean.GetDeviceBuffer(),
+                                           invVariance.GetDeviceBuffer(),
+                                           epsilon,
+                                           PassThrough{},
+                                           dx.GetDeviceBuffer(),
+                                           dscale.GetDeviceBuffer(),
+                                           dbias.GetDeviceBuffer());
+    };
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr   = make_argument(op_ptr);
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // The workspace is sized per instance and released at the end of
+            // this scope, after the timed run has returned.
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t num_bytes =
+                numXYElement * (sizeof(XDataType) + sizeof(DyDataType) + sizeof(DxDataType)) +
+                numScaleBiasMeanVarElement *
+                    (sizeof(ScaleDataType) + sizeof(DscaleDbiasDataType) * 2 +
+                     sizeof(MeanVarDataType) * 2);
+            // ave_time is printed as milliseconds below, hence the 1e6 divisor.
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = static_cast<int>(i);
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+        // run the best instance
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = make_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // Give the final run a workspace exactly like the timing loop does;
+            // it was previously launched without one.
+            // NOTE(review): relies on hipFree in ~SimpleDeviceMem waiting for
+            // the untimed launch to finish - confirm against the HIP docs.
+            SimpleDeviceMem workspace(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/client_example/14_instance_id/CMakeLists.txt
+++ b/client_example/14_instance_id/CMakeLists.txt
+add_executable(client_batchnorm_fwd_instance_id batchnorm_fwd_instance_id.cpp)
+target_link_libraries(client_batchnorm_fwd_instance_id PRIVATE composable_kernel::device_operations)
--- a/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
+++ b/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <array>
+#include <cstddef>
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <numeric>
+#include <string>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp"
+using XDataType       = float; // element type used to size the x buffer
+using YDataType       = float; // element type used to size the y buffer
+using AccDataType     = float; // third template argument of DeviceBatchNormFwd; aliased below
+using ScaleDataType   = AccDataType; // element type used to size the scale buffer
+using BiasDataType    = AccDataType; // element type used to size the bias buffer
+using MeanVarDataType = AccDataType; // element type used to size the mean / invVariance buffers
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // element-wise op handed to the device op
+constexpr int Rank                  = 4; // number of entries in xyLengths / xyStrides
+constexpr int NumBatchNormReduceDim = 3; // number of entries in reduceDims
+const double epsilon       = std::numeric_limits<float>::epsilon(); // forwarded to MakeArgumentPointer
+const double averageFactor = 0.1; // forwarded to MakeArgumentPointer after the output buffers
+/// Minimal owning wrapper around a single hipMalloc allocation.
+///
+/// The constructor requests `mem_size` bytes with hipMalloc and the destructor releases
+/// the pointer with hipFree. The status returned by both calls is deliberately ignored;
+/// p_mem_ is initialised to nullptr before the allocation is attempted.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    // explicit: a byte count should never convert silently into a device allocation
+    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{nullptr}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    // The destructor frees p_mem_, so a copy would release the same pointer twice.
+    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
+    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
+    /// Returns the raw pointer written by hipMalloc (nullptr if it was never written).
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+// In the actual application, the instance index and name are usually from the perf db
+static int instance_index = -1; // stays -1 until main() stores the index of the fastest instance
+static std::string instance_name; // set in main() from GetTypeIdHashCode() of the chosen instance
+/// Profiles every DeviceBatchNormFwd instance returned by the instance factory, records
+/// the fastest supported one in `instance_index` / `instance_name`, then fetches the
+/// instance list again and runs the recorded instance, printing its time and bandwidth.
+///
+/// @return always 0; unsupported instances are skipped silently.
+int main(int argc, char* argv[])
+{
+    // The example takes no command-line options.
+    (void)argc;
+    (void)argv;
+
+    std::array<ck::index_t, Rank> xyLengths{16, 8, 128, 256};
+    std::array<ck::index_t, Rank> xyStrides{8 * 128 * 256, 128 * 256, 256, 1};
+    std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarLengths{256};
+    std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarStrides{1};
+    std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
+    ck::index_t numXYElement =
+        std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies<ck::index_t>());
+    ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(),
+                                                             scaleBiasMeanVarLengths.end(),
+                                                             1,
+                                                             std::multiplies<ck::index_t>());
+    SimpleDeviceMem x(sizeof(XDataType) * numXYElement);
+    SimpleDeviceMem y(sizeof(YDataType) * numXYElement);
+    SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
+    using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd<XDataType,
+                                                                      YDataType,
+                                                                      AccDataType,
+                                                                      ScaleDataType,
+                                                                      BiasDataType,
+                                                                      MeanVarDataType,
+                                                                      PassThrough,
+                                                                      Rank,
+                                                                      NumBatchNormReduceDim>;
+
+    // The profiling pass and the replay pass describe the same problem, so the argument
+    // object is built in exactly one place.
+    const auto make_argument = [&](const auto& op) {
+        return op->MakeArgumentPointer(xyLengths,
+                                       xyStrides,
+                                       xyStrides,
+                                       reduceDims,
+                                       scaleBiasMeanVarLengths,
+                                       scaleBiasMeanVarStrides,
+                                       scaleBiasMeanVarStrides,
+                                       scaleBiasMeanVarStrides,
+                                       x.GetDeviceBuffer(),
+                                       scale.GetDeviceBuffer(),
+                                       bias.GetDeviceBuffer(),
+                                       epsilon,
+                                       PassThrough{},
+                                       y.GetDeviceBuffer(),
+                                       mean.GetDeviceBuffer(),
+                                       invVariance.GetDeviceBuffer(),
+                                       averageFactor,
+                                       nullptr,
+                                       nullptr);
+    };
+
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    bool found          = false;
+    int best_op_index   = -1;
+    float best_ave_time = std::numeric_limits<float>::max();
+    // profile device operation instances and save the best performant instance index and instance
+    // name
+    std::cout << "Run all instances and do timing" << std::endl;
+    // size_t index: op_ptrs.size() is unsigned, so an int counter would mix signedness
+    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = make_argument(op_ptr);
+        auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // The workspace must stay alive until Run() has returned.
+            std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            if(ave_time < best_ave_time)
+            {
+                found         = true;
+                best_op_index = static_cast<int>(i);
+                best_ave_time = ave_time;
+            }
+        }
+    }
+    if(found)
+    {
+        instance_index = best_op_index;
+        instance_name  = op_ptrs[instance_index]->GetTypeIdHashCode();
+    }
+    // simulate the execution of the operation when the instance index and name are available
+    const auto op_ptrs_2 = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    // instance_index is checked for >= 0 first, so the cast to unsigned cannot wrap
+    if(instance_index >= 0 && static_cast<std::size_t>(instance_index) < op_ptrs_2.size())
+    {
+        auto& op_ptr = op_ptrs_2[instance_index];
+        // Only run if the instance at this index still carries the recorded hash.
+        if(op_ptr->GetTypeIdHashCode() == instance_name)
+        {
+            auto argument_ptr = make_argument(op_ptr);
+            auto invoker_ptr  = op_ptr->MakeInvokerPointer();
+            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+            {
+                std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+                SimpleDeviceMem workspace(workspace_sz);
+                op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+                float exec_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+                // Bytes counted: x and y once each, plus scale, bias, mean and invVariance.
+                std::size_t num_bytes =
+                    numXYElement * (sizeof(XDataType) + sizeof(YDataType)) +
+                    numScaleBiasMeanVarElement *
+                        (sizeof(ScaleDataType) + sizeof(BiasDataType) +
+                         sizeof(MeanVarDataType) + sizeof(MeanVarDataType));
+                // exec_time is printed as milliseconds, so bytes / 1e6 / ms is reported as GB/s
+                float gb_per_sec = num_bytes / 1.E6 / exec_time;
+                std::cout << "Kernel execution time: " << std::setw(10) << exec_time
+                          << " ms,  effective data transfer bandwidth: " << gb_per_sec << " GB/s"
+                          << std::endl;
+            }
+        }
+    }
+    return 0;
+}
--- a/doc/markdown/dockerhub.md
+++ b/doc/markdown/dockerhub.md
+## CK docker hub
+[Docker hub](https://hub.docker.com/r/rocm/composable_kernel)
+## Why do I need this?
+To make our lives easier and bring Composable Kernel dependencies together, we recommend using docker images.
+## So what is Composable Kernel?
+The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical kernels for machine learning workloads across multiple architectures, including GPUs, CPUs, etc., through general-purpose kernel languages such as HIP C++.
+To get the CK library
+```
+git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git
+```
+run a docker container 
+```
+docker run                                                            \
+-it                                                                   \
+--privileged                                                          \
+--group-add sudo                                                      \
+-w /root/workspace                                                    \
+-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                         \
+rocm/composable_kernel:ck_ub20.04_rocm5.3_release                     \
+/bin/bash
+```
+and build the CK
+```
+mkdir build && cd build
+# Need to specify target ID, example below is for gfx908 and gfx90a
+cmake                                                                                             \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
+-D CMAKE_CXX_FLAGS="-O3"                                                                          \
+-D CMAKE_BUILD_TYPE=Release                                                                       \
+-D GPU_TARGETS="gfx908;gfx90a"                                                                    \
+..
+```
+and 
+```
+make -j examples tests
+```
+To run all the test cases, including tests and examples, run
+```
+make test
+```
+We can also run specific examples or tests like
+```
+./bin/example_gemm_xdl_fp16
+./bin/test_gemm_fp16
+```
+For more details visit [CK github repo](https://github.com/ROCmSoftwarePlatform/composable_kernel), [CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example), [even more CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example).
+## And what is inside?
+The docker images have everything you need for running CK including:
+* [ROCm](https://www.amd.com/en/graphics/servers-solutions-rocm)
+* [CMake](https://cmake.org/)
+* [Compiler](https://github.com/RadeonOpenCompute/llvm-project)
+## Which image is right for me?
+Let's take a look at the image naming, for example "ck_ub20.04_rocm5.4_release". The image specs are:
+* "ck" - made for running Composable Kernel
+* "ub20.04" - based on Ubuntu 20.04
+* "rocm5.4" - ROCm platform version 5.4
+* "release" - compiler version is release
+So just pick the right image for your project dependencies and you're all set.
+## DIY starts here
+If you need to customize a docker image or just can't stop tinkering, feel free to adjust the [Dockerfile](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile) for your needs.
+## License
+CK is released under the MIT [license](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/LICENSE).
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
@@ -8,3 +8,4 @@ add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp6
 add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp)
 add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp)
 add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp)
--- a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp
@@ -30,6 +30,7 @@ void print_helper_msg()
 template <ck::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
+          typename DsDataType,
          typename OutDataType,
          typename InElementOp,
          typename WeiElementOp,
@@ -46,8 +47,10 @@ bool run_grouped_conv_fwd_dl(bool do_verification,
                             const WeiElementOp& wei_element_op,
                             const OutElementOp& out_element_op)
 {
+    using DDataType = ck::remove_cvref_t<ck::tuple_element_t<0, DsDataType>>;
    Tensor<InDataType> in(in_g_n_c_wis_desc);
    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<DDataType> bias(out_g_n_k_wos_desc);
    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
@@ -59,31 +62,38 @@ bool run_grouped_conv_fwd_dl(bool do_verification,
    {
    case 0: break;
    case 1:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 3});
-        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 3});
+        bias.GenerateTensorValue(GeneratorTensor_2<DDataType>{-2, 3});
        break;
    case 2:
        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        bias.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
-        wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
+        wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{-1});
+        bias.GenerateTensorValue(GeneratorTensor_1<DDataType>{1});
    }
    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem bias_device_buf(sizeof(DDataType) * bias.mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
+    bias_device_buf.ToDevice(bias.mData.data());
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
-    std::array<ck::index_t, NDimSpatial + 3> c_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> c_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
@@ -95,8 +105,10 @@ bool run_grouped_conv_fwd_dl(bool do_verification,
    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
-    copy(out_g_n_k_wos_desc.GetLengths(), c_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetLengths(), d_g_n_k_wos_lengths);
-    copy(out_g_n_k_wos_desc.GetStrides(), c_g_n_k_wos_strides);
+    copy(out_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
    copy(conv_param.conv_filter_strides_, conv_filter_strides);
    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
    copy(conv_param.input_left_pads_, input_left_pads);
@@ -105,25 +117,32 @@ bool run_grouped_conv_fwd_dl(bool do_verification,
    // do Conv
    auto conv     = DeviceConvNDFwdInstance{};
    auto invoker  = conv.MakeInvoker();
-    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+    auto argument = conv.MakeArgument(
-                                      wei_device_buf.GetDeviceBuffer(),
+        in_device_buf.GetDeviceBuffer(),
-                                      out_device_buf.GetDeviceBuffer(),
+        wei_device_buf.GetDeviceBuffer(),
-                                      a_g_n_c_wis_lengths,
+        std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
-                                      a_g_n_c_wis_strides,
+        out_device_buf.GetDeviceBuffer(),
-                                      b_g_k_c_xs_lengths,
+        a_g_n_c_wis_lengths,
-                                      b_g_k_c_xs_strides,
+        a_g_n_c_wis_strides,
-                                      c_g_n_k_wos_lengths,
+        b_g_k_c_xs_lengths,
-                                      c_g_n_k_wos_strides,
+        b_g_k_c_xs_strides,
-                                      conv_filter_strides,
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_lengths}},
-                                      conv_filter_dilations,
+        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_strides}},
-                                      input_left_pads,
+        e_g_n_k_wos_lengths,
-                                      input_right_pads,
+        e_g_n_k_wos_strides,
-                                      in_element_op,
+        conv_filter_strides,
-                                      wei_element_op,
+        conv_filter_dilations,
-                                      out_element_op);
+        input_left_pads,
+        input_right_pads,
+        in_element_op,
+        wei_element_op,
+        out_element_op);
    if(!conv.IsSupportedArgument(argument))
    {
+        std::cout << "wrong! device_conv with the specified compilation parameters does not "
+                     "support this Conv problem"
+                  << std::endl;
        return true;
    }
@@ -139,28 +158,34 @@ bool run_grouped_conv_fwd_dl(bool do_verification,
    if(do_verification)
    {
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
-                                                                     InDataType,
+            NDimSpatial,
-                                                                     WeiDataType,
+            InDataType,
-                                                                     OutDataType,
+            WeiDataType,
-                                                                     InElementOp,
+            OutDataType,
-                                                                     WeiElementOp,
+            InElementOp,
-                                                                     OutElementOp>();
+            WeiElementOp,
+            ck::tensor_operation::element_wise::PassThrough>();
-        auto ref_invoker  = ref_conv.MakeInvoker();
-        auto ref_argument = ref_conv.MakeArgument(in,
+        auto ref_invoker = ref_conv.MakeInvoker();
-                                                  wei,
+        auto ref_argument =
-                                                  out_host,
+            ref_conv.MakeArgument(in,
-                                                  conv_param.conv_filter_strides_,
+                                  wei,
-                                                  conv_param.conv_filter_dilations_,
+                                  out_host,
-                                                  conv_param.input_left_pads_,
+                                  conv_param.conv_filter_strides_,
-                                                  conv_param.input_right_pads_,
+                                  conv_param.conv_filter_dilations_,
-                                                  in_element_op,
+                                  conv_param.input_left_pads_,
-                                                  wei_element_op,
+                                  conv_param.input_right_pads_,
-                                                  out_element_op);
+                                  in_element_op,
+                                  wei_element_op,
+                                  ck::tensor_operation::element_wise::PassThrough{});
        ref_invoker.Run(ref_argument);
+        // cde_elementwise
+        out_host.ForEach(
+            [&](auto&, auto idx) { out_element_op(out_host(idx), out_host(idx), bias(idx)); });
        out_device_buf.FromDevice(out_device.mData.data());
        return ck::utils::check_err(

--- a/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
@@ -3,13 +3,14 @@
 #include "convnd_fwd_dl_common.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
 using AccDataType = float;
+using DsDataType  = ck::Tuple<ck::half_t>;
 using OutDataType = ck::half_t;
 template <ck::index_t... Is>
@@ -17,7 +18,7 @@ using S = ck::Sequence<Is...>;
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
@@ -26,12 +27,12 @@ static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecial
 template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
 // clang-format off
-using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
+using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
-// ######|        NDim|     InData|     WeiData|     OutData|     AccData| InLayout| WeiLayout| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+// ######|        NDim|     InData|     WeiData|  MultipleD|     OutData|     AccData| InLayout| WeiLayout|            MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-// ######|     Spatial|       Type|        Type|        Type|        Type|         |          |          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+// ######|     Spatial|       Type|        Type|       Type|        Type|        Type|         |          |               Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-// ######|            |           |            |            |            |         |          |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+// ######|            |           |            |           |            |            |         |          |                     |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-// ######|            |           |            |            |            |         |          |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+// ######|            |           |            |           |            |            |         |          |                     |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-         < NDimSpatial, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>;
+         < NDimSpatial, InDataType, WeiDataType, DsDataType, OutDataType, AccDataType, InLayout, WeiLayout, ck::Tuple<OutLayout>, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  2,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>,      S<8, 1, 1, 2>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 2>,      S<1, 2, 0, 3>,       S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>,               5,                 4>;
 // clang-format on
 #include "run_convnd_fwd_dl_example.inc"

--- a/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
@@ -3,13 +3,14 @@
 #include "convnd_fwd_dl_common.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 using InDataType  = float;
 using WeiDataType = float;
 using AccDataType = float;
+using DsDataType  = ck::Tuple<float>;
 using OutDataType = float;
 template <ck::index_t... Is>
@@ -17,7 +18,7 @@ using S = ck::Sequence<Is...>;
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
@@ -26,12 +27,12 @@ static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecial
 template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
 // clang-format off
-using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
+using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
-// ######|        NDim|     InData|     WeiData|     OutData|     AccData| InLayout| WeiLayout| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+// ######|        NDim|     InData|     WeiData|  MultipleD|     OutData|     AccData| InLayout| WeiLayout|            MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-// ######|     Spatial|       Type|        Type|        Type|        Type|         |          |          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+// ######|     Spatial|       Type|        Type|       Type|        Type|        Type|         |          |               Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-// ######|            |           |            |            |            |         |          |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+// ######|            |           |            |           |            |            |         |          |                     |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-// ######|            |           |            |            |            |         |          |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+// ######|            |           |            |           |            |            |         |          |                     |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-         < NDimSpatial, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>;
+         < NDimSpatial, InDataType, WeiDataType, DsDataType, OutDataType, AccDataType, InLayout, WeiLayout, ck::Tuple<OutLayout>, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  1,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>,      S<8, 1, 1, 1>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 1>,      S<1, 2, 0, 3>,       S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,               5,                 4>;
 // clang-format on
 #include "run_convnd_fwd_dl_example.inc"

--- a/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
@@ -3,13 +3,14 @@
 #include "convnd_fwd_dl_common.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 using InDataType  = int8_t;
 using WeiDataType = int8_t;
 using AccDataType = int32_t;
+using DsDataType  = ck::Tuple<int8_t>;
 using OutDataType = int8_t;
 template <ck::index_t... Is>
@@ -17,7 +18,7 @@ using S = ck::Sequence<Is...>;
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
@@ -26,12 +27,12 @@ static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecial
 template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
 // clang-format off
-using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
+using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
-// ######|        NDim|     InData|     WeiData|     OutData|     AccData| InLayout| WeiLayout| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
+// ######|        NDim|     InData|     WeiData|  MultipleD|     OutData|     AccData| InLayout| WeiLayout|            MultipleD| OutLayout|           In|           Wei|           Out|    Convolution|              GEMM| Block|  MPer|  NPer| K0Per| K1|      M1Per|      N1Per|   KPer|  M11N11Thread|  M11N11Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|      ABlockTransfer|     ABlockTransfer|      ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|      BBlockTransfer|     BBlockTransfer|      BBlockTransfer|     CThreadTransfer| CThreadTransfer|    CThreadTransfer|
-// ######|     Spatial|       Type|        Type|        Type|        Type|         |          |          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Spacialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
+// ######|     Spatial|       Type|        Type|       Type|        Type|        Type|         |          |               Layout|          |  Elementwise|   Elementwise|   Elementwise|        Forward|    Specialization|  Size| Block| Block| Block|   | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster|      SrcAccess|     SrcVectorTensor|    SrcVectorTensor|     DstVectorTensor|        SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
-// ######|            |           |            |            |            |         |          |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
+// ######|            |           |            |           |            |            |         |          |                     |          |    Operation|     Operation|     Operation| Specialization|                  |      |      |      |      |   |           |           |       |              |              |        K0_M0_M1_K1|          K0_M0_M1_K1|   ArrangeOrder|          Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1|        K0_N0_N1_K1|          K0_N0_N1_K1|   ArrangeOrder|          Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1|               Order|                |                   |
-// ######|            |           |            |            |            |         |          |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
+// ######|            |           |            |           |            |            |         |          |                     |          |             |              |              |               |                  |      |      |      |      |   |           |           |       |              |              |                   |                     |               |               |                    |                   |                    |                   |                     |               |               |                    |                   |                    |                    |                |                   |
-         < NDimSpatial, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                 4>;
+         < NDimSpatial, InDataType, WeiDataType, DsDataType, OutDataType, AccDataType, InLayout, WeiLayout, ck::Tuple<OutLayout>, OutLayout,  InElementOp,  WeiElementOp,  OutElementOp,       ConvSpec,    GemmPadingSpec,   256,   128,   128,    16,  4,          4,          4,      1,       S<8, 2>,       S<8, 2>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>,      S<8, 1, 1, 4>,      S<2, 1, 128, 1>,  S<1, 2, 0, 3>,  S<1, 2, 0, 3>,       S<4, 1, 1, 4>,      S<1, 2, 0, 3>,       S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>,               5,                 4>;
 // clang-format on
 #include "run_convnd_fwd_dl_example.inc"

--- a/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
+++ b/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
@@ -61,6 +61,7 @@ bool run_convnd_fwd_dl_example(int argc, char* argv[])
            ndim_spatial_value,
            InDataType,
            WeiDataType,
+            DsDataType,
            OutDataType,
            InElementOp,
            WeiElementOp,

--- a/example/14_gemm_quantization/CMakeLists.txt
+++ b/example/14_gemm_quantization/CMakeLists.txt
+add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
+add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
\ No newline at end of file
--- a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
+++ b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using I8  = int8_t;
+using I32 = int32_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::Relu;
+using CDEElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
+using ADataType        = I8;
+using BDataType        = I8;
+using AccDataType      = I32;
+using CShuffleDataType = I32;
+using BiasDataType     = I32;
+using DsDataType       = ck::Tuple<BiasDataType>;
+using EDataType        = I8;
+using ALayout    = Row;
+using BLayout    = Col;
+using BiasLayout = Row;
+using DsLayout   = ck::Tuple<BiasLayout>;
+using ELayout    = Row;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+// clang-format off
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<
+     ALayout,
+     BLayout,
+     DsLayout,
+     ELayout,
+     ADataType,
+     BDataType,
+     AccDataType,
+     CShuffleDataType,
+     DsDataType,
+     EDataType,
+     PassThrough,                // AElementwiseOperation,
+     PassThrough,                // BElementwiseOperation,
+     CDEElementOp,               // CDEElementwiseOperation,
+     GemmDefault,                // GemmSpecialization GemmSpec,
+     1,                          // NumGemmKPrefetchStage,
+     256,                        // BlockSize,
+     256,                        // MPerBlock,
+     128,                        // NPerBlock,
+     64,                         // KPerBlock,
+     16,                         // AK1,
+     16,                         // BK1,
+     32,                         // MPerXDL,
+     32,                         // NPerXDL,
+     4,                          // MXdlPerWave,
+     2,                          // NXdlPerWave,
+     S<4, 64, 1>,                // ABlockTransferThreadClusterLengths_AK0_M_AK1,
+     S<1, 0, 2>,                 // ABlockTransferThreadClusterArrangeOrder,
+     S<1, 0, 2>,                 // ABlockTransferSrcAccessOrder,
+     2,                          // index_t ABlockTransferSrcVectorDim,
+     16,                         // index_t ABlockTransferSrcScalarPerVector,
+     16,                         // index_t ABlockTransferDstScalarPerVector_AK1,
+     1,                          // bool ABlockLdsExtraM,
+     S<4, 64, 1>,                // typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+     S<1, 0, 2>,                 // typename BBlockTransferThreadClusterArrangeOrder,
+     S<1, 0, 2>,                 // typename BBlockTransferSrcAccessOrder,
+     2,                          // index_t BBlockTransferSrcVectorDim,
+     8,                          // index_t BBlockTransferSrcScalarPerVector,
+     8,                          // index_t BBlockTransferDstScalarPerVector_BK1,
+     1,                          // bool BBlockLdsExtraN,
+     1,                          // index_t CShuffleMXdlPerWavePerShuffle,
+     1,                          // index_t CShuffleNXdlPerWavePerShuffle,
+     S<1, 64, 1, 4>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+     8>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
+// clang-format on
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        AccDataType,
+                                                                        AccDataType,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        PassThrough>;
+int main()
+{
+    bool do_verification = true;
+    bool time_kernel     = false;
+    // GEMM shape
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+    ck::index_t StrideA    = 1024;
+    ck::index_t StrideB    = 1024;
+    ck::index_t StrideBias = 0;
+    ck::index_t StrideE    = 1024;
+    float requant_scale = 0.03;
+    auto f_host_tensor_descriptor2d =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({stride, 1_uz}));
+            }
+            else
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({1_uz, stride}));
+            }
+        };
+    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
+        return HostTensorDescriptor(std::vector<std::size_t>({len}),
+                                    std::vector<std::size_t>({stride}));
+    };
+    Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
+    Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
+    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "bias_n: " << bias_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+    a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-128, 127});
+    b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-128, 127});
+    bias_n.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-128, 127});
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    a_device_buf.ToDevice(a_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    bias_device_buf.ToDevice(bias_n.mData.data());
+    auto a_element_op   = PassThrough{};
+    auto b_element_op   = PassThrough{};
+    auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
+    // do GEMM
+    auto gemm     = DeviceGemmInstance{};
+    auto invoker  = gemm.MakeInvoker();
+    auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
+                                      b_device_buf.GetDeviceBuffer(),
+                                      {bias_device_buf.GetDeviceBuffer()},
+                                      e_device_buf.GetDeviceBuffer(),
+                                      M,
+                                      N,
+                                      K,
+                                      StrideA,
+                                      StrideB,
+                                      {StrideBias},
+                                      StrideE,
+                                      a_element_op,
+                                      b_element_op,
+                                      cde_element_op);
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+    if(do_verification)
+    {
+        Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
+        auto ref_gemm    = ReferenceGemmInstance{};
+        auto ref_invoker = ref_gemm.MakeInvoker();
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+        ref_invoker.Run(ref_argument);
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), bias_n(n));
+            }
+        }
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+    return 0;
+}
--- a/example/14_gemm_xdl_quantization/gemm_xdl_relu_quantization_int8.cpp
+++ b/example/14_gemm_xdl_quantization/gemm_xdl_relu_quantization_int8.cpp
@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -22,50 +22,59 @@
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
+using I8  = int8_t;
+using I32 = int32_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
 using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
-using ActivationOp = ck::tensor_operation::element_wise::Relu;
+using ActivationOp = PassThrough;
-using CElementOp   = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
+using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
-using ADataType        = int8_t;
+using ADataType        = I8;
-using BDataType        = int8_t;
+using BDataType        = I8;
-using CDataType        = int8_t;
+using AccDataType      = I32;
-using AccDataType      = int32_t;
+using CShuffleDataType = I32;
-using CShuffleDataType = float;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = I8;
-using ALayout = ck::tensor_layout::gemm::RowMajor;
+using ALayout  = Row;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using BLayout  = Col;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
 static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
 // clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle<
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<
-     ALayout,                    // typename ALayout,
+     ALayout,
-     BLayout,                    // typename BLayout,
+     BLayout,
-     CLayout,                    // typename CLayout,
+     DsLayout,
-     ADataType,                  // typename ADataType,
+     ELayout,
-     BDataType,                  // typename BDataType,
+     ADataType,
-     CDataType,                  // typename CDataType,
+     BDataType,
-     AccDataType,                // typename GemmAccDataType,
+     AccDataType,
-     CShuffleDataType,           // typename CShuffleDataType,
+     CShuffleDataType,
-     PassThrough,                // typename AElementwiseOperation,
+     DsDataType,
-     PassThrough,                // typename BElementwiseOperation,
+     EDataType,
-     CElementOp,         // typename CElementwiseOperation,
+     PassThrough,                // AElementwiseOperation,
+     PassThrough,                // BElementwiseOperation,
+     CDEElementOp,               // CDEElementwiseOperation,
     GemmDefault,                // GemmSpecialization GemmSpec,
-     1,                          // index_t NumGemmKPrefetchStage,
+     1,                          // NumGemmKPrefetchStage,
-     256,                        // index_t BlockSize,
+     256,                        // BlockSize,
-     256,                        // index_t MPerBlock,
+     256,                        // MPerBlock,
-     128,                        // index_t NPerBlock,
+     128,                        // NPerBlock,
-     64,                         // index_t KPerBlock,
+     64,                         // KPerBlock,
-     16,                         // index_t AK1,
+     16,                         // AK1,
-     16,                         // index_t BK1,
+     16,                         // BK1,
-     32,                         // index_t MPerXDL,
+     32,                         // MPerXDL,
-     32,                         // index_t NPerXDL,
+     32,                         // NPerXDL,
-     4,                          // index_t MXdlPerWave,
+     4,                          // MXdlPerWave,
-     2,                          // index_t NXdlPerWave,
+     2,                          // NXdlPerWave,
-     S<4, 64, 1>,                // typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+     S<4, 64, 1>,                // ABlockTransferThreadClusterLengths_AK0_M_AK1,
-     S<1, 0, 2>,                 // typename ABlockTransferThreadClusterArrangeOrder,
+     S<1, 0, 2>,                 // ABlockTransferThreadClusterArrangeOrder,
-     S<1, 0, 2>,                 // typename ABlockTransferSrcAccessOrder,
+     S<1, 0, 2>,                 // ABlockTransferSrcAccessOrder,
     2,                          // index_t ABlockTransferSrcVectorDim,
     16,                         // index_t ABlockTransferSrcScalarPerVector,
     16,                         // index_t ABlockTransferDstScalarPerVector_AK1,
@@ -84,53 +93,23 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 // clang-format on
 using ReferenceGemmInstance = ck::tensor_operation::host::
-    ReferenceGemm<ADataType, BDataType, CDataType, float, PassThrough, PassThrough, CElementOp>;
+    ReferenceGemm<ADataType, BDataType, EDataType, float, PassThrough, PassThrough, CDEElementOp>;
-int main(int argc, char* argv[])
+int main()
 {
    bool do_verification = true;
-    int init_method      = 1;
    bool time_kernel     = false;
    // GEMM shape
-    ck::index_t M = 3840;
+    ck::index_t M = 1024;
-    ck::index_t N = 4096;
+    ck::index_t N = 1024;
-    ck::index_t K = 4096;
+    ck::index_t K = 1024;
-    ck::index_t StrideA = 4096;
-    ck::index_t StrideB = 4096;
-    ck::index_t StrideC = 4096;
-    float quant_multiplier = 0.03;
-    if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-    }
-    else if(argc == 10)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        time_kernel     = std::stoi(argv[3]);
-        M = std::stoi(argv[4]);
+    ck::index_t StrideA = 1024;
-        N = std::stoi(argv[5]);
+    ck::index_t StrideB = 1024;
-        K = std::stoi(argv[6]);
+    ck::index_t StrideE = 1024;
-        StrideA = std::stoi(argv[7]);
+    float requant_scale = 0.03;
-        StrideB = std::stoi(argv[8]);
-        StrideC = std::stoi(argv[9]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: time kernel (0=n0, 1=yes)\n");
-        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
-        exit(0);
-    }
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
@@ -138,61 +117,56 @@ int main(int argc, char* argv[])
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({stride, 1_uz}));
            }
            else
            {
-                return HostTensorDescriptor({row, col}, {1_uz, stride});
+                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                            std::vector<std::size_t>({1_uz, stride}));
            }
        };
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
-    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
-    switch(init_method)
+    a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-128, 127});
-    {
+    b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-128, 127});
-    case 0: break;
-    case 1:
-        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        break;
-    default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-    }
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
-    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
-    auto a_element_op = PassThrough{};
+    auto a_element_op   = PassThrough{};
-    auto b_element_op = PassThrough{};
+    auto b_element_op   = PassThrough{};
-    auto c_element_op = CElementOp{quant_multiplier, ActivationOp{}};
+    auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
    // do GEMM
    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
-    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+    auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
-                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                      b_device_buf.GetDeviceBuffer(),
-                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                      {},
+                                      e_device_buf.GetDeviceBuffer(),
                                      M,
                                      N,
                                      K,
                                      StrideA,
                                      StrideB,
-                                      StrideC,
+                                      {},
+                                      StrideE,
                                      a_element_op,
                                      b_element_op,
-                                      c_element_op);
+                                      cde_element_op);
    if(!gemm.IsSupportedArgument(argument))
    {
@@ -205,7 +179,7 @@ int main(int argc, char* argv[])
    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
-        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -214,7 +188,7 @@ int main(int argc, char* argv[])
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;
-    c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
    if(do_verification)
    {
@@ -222,11 +196,11 @@ int main(int argc, char* argv[])
        auto ref_invoker = ref_gemm.MakeInvoker();
        auto ref_argument = ref_gemm.MakeArgument(
-            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+            a_m_k, b_k_n, e_m_n_host_result, a_element_op, b_element_op, cde_element_op);
        ref_invoker.Run(ref_argument);
-        return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result) ? 0 : 1;
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
    }
    return 0;