Commit 497c30e0 authored by Alan Turner


Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into migx-jit-lib
parents 17acbbf4 8b9cbba8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <iostream>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using BiasDataType = int32_t;
using OutDataType = int8_t;
using InLayout = ck::tensor_layout::convolution::NHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using OutLayout = ck::tensor_layout::convolution::NHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 4;
static constexpr ck::index_t N = 4; // batch size
static constexpr ck::index_t K = 32; // output channel
static constexpr ck::index_t C = 64; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 71; // input H
static constexpr ck::index_t Wi = 71; // input W
static constexpr ck::index_t Ho = 36; // output H
static constexpr ck::index_t Wo = 36; // output W
static constexpr float sacc = 0.5f; // scale of acc
static constexpr float sz_inv = 0.5f; // inverse of scale_z
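// The fused output operator below (Add_Mul_Activation_Mul_Clamp) roughly computes, per element,
// y = saturate_to_int8(sz_inv * tanh(sacc * (acc + bias))), where acc is the int32 accumulator.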
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
// However, CK's API only accepts lengths and strides in GNCHW/GKCYX/GNKHW order,
// so we need to adjust the order of the strides accordingly.
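// e.g. for the NHWGC input with (G, N, C, Hi, Wi) lengths, the strides below are
// G -> C, N -> Hi * Wi * G * C, C -> 1, Hi -> Wi * G * C, Wi -> G * C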
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
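// bias has shape (G, K); the zero strides broadcast it over N, Ho and Wo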
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
std::array<ck::index_t, 5> out_strides{K, Ho * Wo * G * K, 1, Wo * G * K, G * K};
std::array<ck::index_t, 2> in_left_pad{1, 1};
std::array<ck::index_t, 2> in_right_pad{1, 1};
std::array<ck::index_t, 2> conv_strides{2, 2};
std::array<ck::index_t, 2> conv_dilations{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<BiasDataType>,
OutDataType,
PassThrough,
PassThrough,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{bias.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
weight_lengths,
weight_strides,
{bias_lengths},
{bias_strides},
out_lengths,
out_strides,
conv_strides,
conv_dilations,
in_left_pad,
in_right_pad,
PassThrough{},
PassThrough{},
OutElementOp{sacc, sz_inv, ActivationOp{}});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
std::size_t num_bytes =
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
// run the best instance
if(best_op_id != -1)
{
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{bias.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
weight_lengths,
weight_strides,
{bias_lengths},
{bias_strides},
out_lengths,
out_strides,
conv_strides,
conv_dilations,
in_left_pad,
in_right_pad,
PassThrough{},
PassThrough{},
OutElementOp{sacc, sz_inv, ActivationOp{}});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
@@ -16,25 +16,25 @@ using WeiDataType = int8_t;
 using RequantScaleDataType = float;
 using OutDataType = int8_t;
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using InLayout = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout = ck::tensor_layout::convolution::GKYXC;
 using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using OutLayout = ck::tensor_layout::convolution::NHWGK;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G = 1;
-static constexpr ck::index_t N = 4;
-static constexpr ck::index_t K = 64;
-static constexpr ck::index_t C = 32;
-static constexpr ck::index_t Y = 3;
-static constexpr ck::index_t X = 3;
-static constexpr ck::index_t Hi = 71;
-static constexpr ck::index_t Wi = 71;
-static constexpr ck::index_t Ho = 36;
-static constexpr ck::index_t Wo = 36;
+static constexpr ck::index_t G = 4;
+static constexpr ck::index_t N = 4;   // batch size
+static constexpr ck::index_t K = 32;  // output channel
+static constexpr ck::index_t C = 64;  // input channel (per group)
+static constexpr ck::index_t Y = 3;   // filter H
+static constexpr ck::index_t X = 3;   // filter W
+static constexpr ck::index_t Hi = 71; // input H
+static constexpr ck::index_t Wi = 71; // input W
+static constexpr ck::index_t Ho = 36; // output H
+static constexpr ck::index_t Wo = 36; // output W
 struct SimpleDeviceMem
 {
@@ -54,23 +54,27 @@ struct SimpleDeviceMem
 int main(int argc, char* argv[])
 {
+// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+// Hence, we need to adjust the order of stride
 std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
 std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
 std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
 std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
 std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
-std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo};
-std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
+std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
 std::array<ck::index_t, 2> in_left_pad{1, 1};
 std::array<ck::index_t, 2> in_right_pad{1, 1};
 std::array<ck::index_t, 2> conv_strides{2, 2};
 std::array<ck::index_t, 2> conv_dilations{1, 1};
-SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
-SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
+SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
 using DeviceOp =
 ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
@@ -130,10 +134,10 @@ int main(int argc, char* argv[])
 {
 float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
 std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
-std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
-                        G * sizeof(WeiDataType) * K * Y * X * C +
-                        G * sizeof(OutDataType) * N * Ho * Wo * K;
+std::size_t num_bytes =
+    G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
+    G * sizeof(RequantScaleDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
 float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
 float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -156,11 +160,12 @@ int main(int argc, char* argv[])
 }
 }
-std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
-          << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 // run the best intance
+if(best_op_id != -1)
 {
+std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+          << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 auto& op_ptr = op_ptrs[best_op_id];
 std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
           << std::endl;
@@ -195,4 +200,4 @@ int main(int argc, char* argv[])
 }
 return 0;
 }
\ No newline at end of file
@@ -15,24 +15,25 @@ using InDataType = int8_t;
 using WeiDataType = int8_t;
 using OutDataType = int8_t;
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using InLayout = ck::tensor_layout::convolution::NHWGC;
 using WeiLayout = ck::tensor_layout::convolution::GKYXC;
-using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using OutLayout = ck::tensor_layout::convolution::NHWGK;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using ActivationOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t G = 1;
-static constexpr ck::index_t N = 4;
-static constexpr ck::index_t K = 64;
-static constexpr ck::index_t C = 32;
-static constexpr ck::index_t Y = 3;
-static constexpr ck::index_t X = 3;
-static constexpr ck::index_t Hi = 71;
-static constexpr ck::index_t Wi = 71;
-static constexpr ck::index_t Ho = 36;
-static constexpr ck::index_t Wo = 36;
+static constexpr ck::index_t G = 4;
+static constexpr ck::index_t N = 4;   // batch size
+static constexpr ck::index_t K = 32;  // output channel
+static constexpr ck::index_t C = 64;  // input channel (per group)
+static constexpr ck::index_t Y = 3;   // filter H
+static constexpr ck::index_t X = 3;   // filter W
+static constexpr ck::index_t Hi = 71; // input H
+static constexpr ck::index_t Wi = 71; // input W
+static constexpr ck::index_t Ho = 36; // output H
+static constexpr ck::index_t Wo = 36; // output W
+static constexpr float requant_scale = 0.5f; // requantize qAcc to qY
 struct SimpleDeviceMem
 {
@@ -52,20 +53,24 @@ struct SimpleDeviceMem
 int main(int argc, char* argv[])
 {
+// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space
+// However, CK's API only accept length and stride with order of GNCHW/GKCYX/GNCHW
+// Hence, we need to adjust the order of stride
 std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
-std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
 std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
 std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
-std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo};
-std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
+std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
+std::array<ck::index_t, 5> out_strides{C, Ho * Wo * G * C, 1, Wo * G * C, G * C};
 std::array<ck::index_t, 2> in_left_pad{1, 1};
 std::array<ck::index_t, 2> in_right_pad{1, 1};
 std::array<ck::index_t, 2> conv_strides{2, 2};
 std::array<ck::index_t, 2> conv_dilations{1, 1};
-SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
-SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
-SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
+SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * G * K);
 using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
 InLayout,
@@ -96,26 +101,27 @@ int main(int argc, char* argv[])
 for(int i = 0; i < op_ptrs.size(); ++i)
 {
 auto& op_ptr = op_ptrs[i];
 auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
 wei.GetDeviceBuffer(),
 {},
 out.GetDeviceBuffer(),
 in_lengths,
 in_strides,
 weight_lengths,
 weight_strides,
 {},
 {},
 out_lengths,
 out_strides,
 conv_strides,
 conv_dilations,
 in_left_pad,
 in_right_pad,
 PassThrough{},
 PassThrough{},
-OutElementOp{0.5f, ActivationOp{}});
+OutElementOp{requant_scale, ActivationOp{}});
 auto invoker_ptr = op_ptr->MakeInvokerPointer();
 std::string op_name = op_ptr->GetTypeString();
@@ -150,33 +156,34 @@ int main(int argc, char* argv[])
 }
 }
-std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
-          << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
-// run the best intance
+if(best_op_id != -1)
 {
+std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+          << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
 auto& op_ptr = op_ptrs[best_op_id];
 std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
           << std::endl;
 auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
 wei.GetDeviceBuffer(),
 {},
 out.GetDeviceBuffer(),
 in_lengths,
 in_strides,
 weight_lengths,
 weight_strides,
 {},
 {},
 out_lengths,
 out_strides,
 conv_strides,
 conv_dilations,
 in_left_pad,
 in_right_pad,
 PassThrough{},
 PassThrough{},
-OutElementOp{0.5f, ActivationOp{}});
+OutElementOp{requant_scale, ActivationOp{}});
 auto invoker_ptr = op_ptr->MakeInvokerPointer();
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <iostream>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp"
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using ActivationOp = PassThrough;
using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
using ADataType = int8_t;
using BDataType = int8_t;
using EDataType = int8_t;
using ALayout = Row;
using BLayout = Col;
using ELayout = Row;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 1024;
ck::index_t StrideA = 1024;
ck::index_t StrideB = 1024;
ck::index_t StrideE = 1024;
float requant_scale = 0.03;
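// Activation_Mul_Clamp roughly computes y = saturate_to_int8(requant_scale * activation(acc));
// with ActivationOp = PassThrough this is a plain requantization of the int32 accumulator.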
auto f_matrix_space_size =
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
using Layout = decltype(layout);
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
{
return (nRow - 1) * stride + nCol;
}
else
{
return (nCol - 1) * stride + nRow;
}
};
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
BLayout,
ck::Tuple<>,
ELayout,
ADataType,
BDataType,
ck::Tuple<>,
EDataType,
AElementOp,
BElementOp,
CDEElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_bytes =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id != -1)
{
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
+add_executable(client_grouped_conv1d_bwd_weight_fp16 grouped_conv1d_bwd_weight_fp16.cpp)
 add_executable(client_grouped_conv2d_bwd_weight_fp16 grouped_conv2d_bwd_weight_fp16.cpp)
 add_executable(client_grouped_conv3d_bwd_weight_fp16 grouped_conv3d_bwd_weight_fp16.cpp)
 add_executable(client_grouped_conv3d_bwd_weight_fp32 grouped_conv3d_bwd_weight_fp32.cpp)
+target_link_libraries(client_grouped_conv1d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations)
 target_link_libraries(client_grouped_conv2d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations)
 target_link_libraries(client_grouped_conv3d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations)
 target_link_libraries(client_grouped_conv3d_bwd_weight_fp32 PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using InLayout = ck::tensor_layout::convolution::GNWC;
using WeiLayout = ck::tensor_layout::convolution::GKXC;
using OutLayout = ck::tensor_layout::convolution::GNWK;
static constexpr ck::index_t NumDimSpatial = 1;
static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 256;
static constexpr ck::index_t K = 192;
static constexpr ck::index_t C = 192;
static constexpr ck::index_t X = 3;
static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Wo = 28;
int main()
{
return run_grouped_conv_bwd_weight<NumDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InLayout,
WeiLayout,
OutLayout>(G, N, K, C, {Wi}, {X}, {Wo}, {1}, {1}, {1}, {1})
? EXIT_SUCCESS
: EXIT_FAILURE;
}
add_executable(client_grouped_gemm_fastgelu grouped_gemm_fastgelu.cpp)
target_link_libraries(client_grouped_gemm_fastgelu PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <iostream>
#include <vector>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_fastgelu.hpp"
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
using ADataType = F16;
using BDataType = F16;
using DsDataType = ck::Tuple<>;
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = FastGelu;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main()
{
std::mt19937 gen(19391);
std::uniform_int_distribution<> distrib(1, 10);
int group_count = distrib(gen);
std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideEs;
for(int i = 0; i < group_count; ++i)
{
Ms.push_back(256 + 256 * distrib(gen));
Ns.push_back(256 + 256 * distrib(gen));
Ks.push_back(128 + 128 * distrib(gen));
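// leading-dimension stride: number of columns for a row-major matrix, number of rows for a column-major one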
StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
StrideEs.push_back(std::is_same<Row, ELayout>::value ? Ns[i] : Ms[i]);
}
auto f_matrix_space_size =
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
using Layout = decltype(layout);
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
{
return (nRow - 1) * stride + nCol;
}
else
{
return (nCol - 1) * stride + nRow;
}
};
std::vector<SimpleDeviceMem> a_dev_bufs, b_dev_bufs, e_dev_bufs;
a_dev_bufs.reserve(group_count);
b_dev_bufs.reserve(group_count);
e_dev_bufs.reserve(group_count);
std::vector<const void*> p_a, p_b;
std::vector<void*> p_e;
p_a.reserve(group_count);
p_b.reserve(group_count);
p_e.reserve(group_count);
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
gemm_descs.reserve(group_count);
for(int i = 0; i < group_count; ++i)
{
a_dev_bufs.emplace_back(sizeof(ADataType) *
f_matrix_space_size(Ms[i], Ks[i], StrideAs[i], ALayout{}));
b_dev_bufs.emplace_back(sizeof(BDataType) *
f_matrix_space_size(Ks[i], Ns[i], StrideBs[i], BLayout{}));
e_dev_bufs.emplace_back(sizeof(EDataType) *
f_matrix_space_size(Ms[i], Ns[i], StrideEs[i], ELayout{}));
gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideEs[i], {}});
p_a.push_back(a_dev_bufs[i].GetDeviceBuffer());
p_b.push_back(b_dev_bufs[i].GetDeviceBuffer());
p_e.push_back(e_dev_bufs[i].GetDeviceBuffer());
}
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm<ALayout,
BLayout,
DsLayout,
ELayout,
ADataType,
BDataType,
DsDataType,
EDataType,
AElementOp,
BElementOp,
CDEElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{};
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
SimpleDeviceMem gemm_desc_workspace(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
op_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = 0, num_btype = 0;
for(std::size_t j = 0; j < gemm_descs.size(); ++j)
{
flop += std::size_t(2) * Ms[j] * Ns[j] * Ks[j];
num_btype += sizeof(ADataType) * Ms[j] * Ks[j] + sizeof(BDataType) * Ks[j] * Ns[j] +
sizeof(EDataType) * Ms[j] * Ns[j];
}
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best instance
if(found)
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
SimpleDeviceMem gemm_desc_workspace(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
op_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
add_executable(client_groupnorm_swish groupnorm_swish.cpp)
target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp"
using XDataType = ck::half_t;
using GammaDataType = float;
using BetaDataType = float;
using YDataType = ck::half_t;
using ComputeDataType = float;
using Swish = ck::tensor_operation::element_wise::Swish;
constexpr int Rank = 5;
constexpr int NumReduceDim = 3;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
ck::index_t N = 32;
ck::index_t H = 16;
ck::index_t W = 16;
ck::index_t G = 64;
ck::index_t C = 128;
std::size_t xy_size = N * H * W * G * C;
std::size_t gamma_beta_size = G * C;
std::vector<ck::index_t> xy_strides = {H * W * G * C, W * G * C, G * C, C, 1};
std::vector<ck::index_t> gamma_beta_strides = {0, 0, 0, C, 1};
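// gamma and beta have shape (G, C); the zero strides broadcast them over N, H and W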
SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size);
SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size);
SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
Swish,
Rank,
NumReduceDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
nullptr,
nullptr,
Swish{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_byte =
sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size +
sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size;
float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
// run the best instance
if(found)
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
xy_strides, // xStrides
gamma_beta_strides, // gammaStrides
gamma_beta_strides, // betaStrides
xy_strides, // yStrides
{1, 2, 4}, // reduceDims
1e-6,
x_device_buf.GetDeviceBuffer(),
gamma_device_buf.GetDeviceBuffer(),
beta_device_buf.GetDeviceBuffer(),
y_device_buf.GetDeviceBuffer(),
nullptr,
nullptr,
Swish{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
@@ -65,7 +65,8 @@ else()
 -Wuninitialized
 -Wunreachable-code
 -Wunused
+-Wno-reserved-identifier
+-Werror
 -Wsign-compare
 -Wno-extra-semi-stmt
 )
@@ -21,6 +21,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
 -Wno-comma
 -Wno-old-style-cast
 -Wno-deprecated
+-Wno-unsafe-buffer-usage
 )
 message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
## CK docker hub
[Docker hub](https://hub.docker.com/r/rocm/composable_kernel)
## Why do I need this?
To make our lives easier and bring Composable Kernel dependencies together, we recommend using docker images.
## So what is Composable Kernel?
The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical kernels for machine learning workloads across multiple architectures, including GPUs and CPUs, using general-purpose kernel languages such as HIP C++.
To get the CK library
```
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git
```
run a docker container
```
docker run \
-it \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/composable_kernel:ck_ub20.04_rocm5.3_release \
/bin/bash
```
and configure the CK build
```
mkdir build && cd build
# Need to specify target ID, example below is for gfx908 and gfx90a
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
```
and then build the examples and tests
```
make -j examples tests
```
To run all the test cases (both tests and examples), run
```
make test
```
We can also run specific examples or tests like
```
./bin/example_gemm_xdl_fp16
./bin/test_gemm_fp16
```
For more details visit [CK github repo](https://github.com/ROCmSoftwarePlatform/composable_kernel), [CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example), [even more CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example).
## And what is inside?
The docker images have everything you need for running CK including:
* [ROCm](https://www.amd.com/en/graphics/servers-solutions-rocm)
* [CMake](https://cmake.org/)
* [Compiler](https://github.com/RadeonOpenCompute/llvm-project)
## Which image is right for me?
Let's take a look at the image naming, for example "ck_ub20.04_rocm5.4_release". The image specs are:
* "ck" - made for running Composable Kernel
* "ub20.04" - based on Ubuntu 20.04
* "rocm5.4" - ROCm platform version 5.4
* "release" - compiler version is release
So just pick the right image for your project dependencies and you're all set.
## DIY starts here
If you need to customize a docker image or just can't stop tinkering, feel free to adjust the [Dockerfile](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile) for your needs.
## License
CK is released under the MIT [license](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/LICENSE).
## CK Hello world
## Motivation
This tutorial is aimed at engineers working in artificial intelligence and machine learning who would like to optimize their pipelines and squeeze out every drop of performance by adding the Composable Kernel (CK) library to their projects. We would like to make the CK library approachable, so the tutorial is not based on the latest release and doesn't include all the bleeding-edge features, but it will be reproducible now and forever.
In this tutorial we give an introduction to the CK library, build it, and run some examples and tests; so to speak, we run a "Hello world" example. In future tutorials we will go deeper and broader and get familiar with other tools and ways to integrate CK into your project.
## Description
Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and efficient workflows is still challenging. CK is one of the tools for making the AI heavy lifting as fast and efficient as possible. CK is a collection of optimized AI operator kernels plus the tools to create new ones. The library has components required for the majority of modern neural network architectures, including matrix multiplication, convolution, contraction, reduction, attention modules, a variety of activation functions, fused operators and many more.
So how do we (almost) reach the speed of light? CK's acceleration abilities are based on (a short code sketch follows the list):
* Layered structure.
* Tile-based computation model.
* Tensor coordinate transformation.
* Use of hardware acceleration.
* Support for low-precision data types, including fp16, bf16, int8 and int4.
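To give a flavor of how these pieces surface in user code, here is a condensed sketch based on the int8 GEMM quantization client example that ships with CK (the headers, template parameters and factory call follow that example and may differ between CK versions); it asks the instance factory for all kernel instances of a fused, requantizing int8 GEMM:
```
// A condensed sketch (not a complete benchmark): list the available device op
// instances for an int8 GEMM with a fused requantization epilogue.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp"

using Row          = ck::tensor_layout::gemm::RowMajor;
using Col          = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;

// int8 inputs and output, no extra D tensors, the epilogue does the requantization
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<Row,
                                                                   Col,
                                                                   ck::Tuple<>,
                                                                   Row,
                                                                   int8_t,
                                                                   int8_t,
                                                                   ck::Tuple<>,
                                                                   int8_t,
                                                                   PassThrough,
                                                                   PassThrough,
                                                                   CDEElementOp>;

int main()
{
    // every tuned kernel instance registered for this operation type
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
    // a real program would now create an argument with MakeArgumentPointer(...),
    // filter with IsSupportedArgument(...) and run the fastest instance
    return 0;
}
```
The full client example then times every instance and reruns the best one without timing.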
If you are excited and want more technical details and benchmarking results, read this awesome blog [post](https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224).
For more details visit our [github repo](https://github.com/ROCmSoftwarePlatform/composable_kernel).
## Hardware targets
The CK library fully supports the "gfx908" and "gfx90a" GPU architectures, and only some operators are supported on "gfx1030". Check the hardware you have at hand and decide on the target GPU architecture:
| GPU Target | AMD GPU |
|------------|---------|
| gfx908 | Radeon Instinct MI100 |
| gfx90a | Radeon Instinct MI210, MI250, MI250X |
| gfx1030 | Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT |
There are also [cloud options](https://aws.amazon.com/ec2/instance-types/g4/) you can find if you don't have an AMD GPU at hand.
## Build the library
First let's clone the library and check out the tested version:
```
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git
cd composable_kernel/
git checkout tutorial_hello_world
```
To make our lives easier we prepared [docker images](https://hub.docker.com/r/rocm/composable_kernel) with all the necessary dependencies. Pick the right image and create a container. In this tutorial we use the "rocm/composable_kernel:ck_ub20.04_rocm5.3_release" image; it is based on Ubuntu 20.04, ROCm v5.3 and the release version of the compiler.
If your current folder is ${HOME}, start the docker container with
```
docker run \
-it \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${HOME}:/root/workspace \
rocm/composable_kernel:ck_ub20.04_rocm5.3_release \
/bin/bash
```
If your current folder is different from ${HOME}, adjust the line `-v ${HOME}:/root/workspace` to fit your folder structure.
Inside the docker container the current folder is "~/workspace" and the library path is "~/workspace/composable_kernel". Navigate to the library:
```
cd composable_kernel/
```
Create and go to the "build" directory
```
mkdir build && cd build
```
In the previous section we talked about target GPU architecture. Once you decide which one is right for you, run cmake using the right GPU_TARGETS flag
```
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=OFF \
-D GPU_TARGETS="gfx908;gfx90a;gfx1030" ..
```
If everything went well the cmake run will end up with:
```
-- Configuring done
-- Generating done
-- Build files have been written to: "/root/workspace/composable_kernel/build"
```
Finally, we can build examples and tests
```
make -j examples tests
```
If everything is smooth, you'll see
```
Scanning dependencies of target tests
[100%] Built target tests
```
## Run examples and tests
Examples are listed as test cases as well, so we can run all examples and tests with
```
ctest
```
You can check the list of all tests by running
```
ctest -N
```
We can also run them separately; here is an individual example execution:
```
./bin/example_gemm_xdl_fp16 1 1 1
```
The arguments "1 1 1" mean that we want to run this example in the mode: verify results with CPU, initialize matrices with integers and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change.
If everything goes well and you have a device based on gfx908 or gfx90a architecture you should see something like
```
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.10017 ms, 117.117 TFlops, 87.6854 GB/s, DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1
```
Meanwhile, running it on a gfx1030 device should result in
```
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem
```
But don't panic: some operators are supported on the gfx1030 architecture, so you can run an example like
```
./bin/example_gemm_dl_fp16 1 1 1
```
and it should produce output similar to
```
a_m_k: dim 2, lengths {3840, 4096}, strides {1, 4096}
b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
arg.c_grid_desc_m_n_{ 3840, 4096}
launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1>
```
Or we can run a separate test
```
ctest -R test_gemm_fp16
```
If everything goes well you should see something like
```
Start 121: test_gemm_fp16
1/1 Test #121: test_gemm_fp16 ................... Passed 51.81 sec
100% tests passed, 0 tests failed out of 1
```
## Summary
In this tutorial we took a first look at the Composable Kernel library, built it on your system and ran some examples and tests. Stay tuned: in the next tutorial we will run kernels with different configs to find the best one for your hardware and task.
P.S.: Don't forget to shut down the cloud instance if you have launched one; you can surely find better ways to spend your money!
@@ -51,7 +51,7 @@ PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and
 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
 # the logo to the output directory.
-PROJECT_LOGO = ./rocm.jpg
+PROJECT_LOGO =
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
 # into which the generated documentation will be written. If a relative path is
@@ -775,8 +775,10 @@ WARN_LOGFILE =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
-INPUT = ../library/include \
-        ../library/include/internal
+INPUT = ../../include/ck/tensor_operation/gpu/grid \
+        ../../include/ck/tensor_operation/gpu/block \
+        ../../include/ck/tensor_operation/gpu/thread \
+        ../../library/include/ck/library/utility
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -845,7 +847,7 @@ FILE_PATTERNS = *.c \
 # be searched for input files as well.
 # The default value is: NO.
-RECURSIVE = NO
+RECURSIVE = YES
 # The EXCLUDE tag can be used to specify files and/or directories that should be
 # excluded from the INPUT source files. This way you can easily exclude a
rocm-docs-core==0.2.0
sphinxcontrib-bibtex==2.5.0
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile .sphinx/requirements.in
#
accessible-pygments==0.0.3
# via pydata-sphinx-theme
alabaster==0.7.13
# via sphinx
asttokens==2.2.1
# via stack-data
attrs==22.2.0
# via
# jsonschema
# jupyter-cache
babel==2.12.1
# via
# pydata-sphinx-theme
# sphinx
backcall==0.2.0
# via ipython
beautifulsoup4==4.11.2
# via pydata-sphinx-theme
breathe==4.34.0
# via rocm-docs-core
certifi==2022.12.7
# via requests
cffi==1.15.1
# via pynacl
charset-normalizer==3.1.0
# via requests
click==8.1.3
# via
# jupyter-cache
# sphinx-external-toc
comm==0.1.2
# via ipykernel
debugpy==1.6.6
# via ipykernel
decorator==5.1.1
# via ipython
deprecated==1.2.13
# via pygithub
docutils==0.16
# via
# breathe
# myst-parser
# pybtex-docutils
# pydata-sphinx-theme
# rocm-docs-core
# sphinx
# sphinxcontrib-bibtex
executing==1.2.0
# via stack-data
fastjsonschema==2.16.3
# via nbformat
gitdb==4.0.10
# via gitpython
gitpython==3.1.31
# via rocm-docs-core
greenlet==2.0.2
# via sqlalchemy
idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.0.0
# via
# jupyter-cache
# myst-nb
ipykernel==6.21.3
# via myst-nb
ipython==8.11.0
# via
# ipykernel
# myst-nb
jedi==0.18.2
# via ipython
jinja2==3.1.2
# via
# myst-parser
# sphinx
jsonschema==4.17.3
# via nbformat
jupyter-cache==0.5.0
# via myst-nb
jupyter-client==8.0.3
# via
# ipykernel
# nbclient
jupyter-core==5.3.0
# via
# ipykernel
# jupyter-client
# nbformat
latexcodec==2.0.1
# via pybtex
linkify-it-py==1.0.3
# via myst-parser
markdown-it-py==2.2.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==2.1.2
# via jinja2
matplotlib-inline==0.1.6
# via
# ipykernel
# ipython
mdit-py-plugins==0.3.5
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-nb==0.17.1
# via rocm-docs-core
myst-parser[linkify]==0.18.1
# via
# myst-nb
# rocm-docs-core
nbclient==0.5.13
# via
# jupyter-cache
# myst-nb
nbformat==5.7.3
# via
# jupyter-cache
# myst-nb
# nbclient
nest-asyncio==1.5.6
# via
# ipykernel
# nbclient
packaging==23.0
# via
# ipykernel
# pydata-sphinx-theme
# sphinx
parso==0.8.3
# via jedi
pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
platformdirs==3.1.1
# via jupyter-core
prompt-toolkit==3.0.38
# via ipython
psutil==5.9.4
# via ipykernel
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pybtex==0.24.0
# via
# pybtex-docutils
# sphinxcontrib-bibtex
pybtex-docutils==1.0.2
# via sphinxcontrib-bibtex
pycparser==2.21
# via cffi
pydata-sphinx-theme==0.13.1
# via sphinx-book-theme
pygithub==1.57
# via rocm-docs-core
pygments==2.14.0
# via
# accessible-pygments
# ipython
# pydata-sphinx-theme
# sphinx
pyjwt==2.6.0
# via pygithub
pynacl==1.5.0
# via pygithub
pyrsistent==0.19.3
# via jsonschema
python-dateutil==2.8.2
# via jupyter-client
pyyaml==6.0
# via
# jupyter-cache
# myst-nb
# myst-parser
# pybtex
# sphinx-external-toc
pyzmq==25.0.1
# via
# ipykernel
# jupyter-client
requests==2.28.2
# via
# pygithub
# sphinx
rocm-docs-core==0.2.0
# via -r .sphinx/requirements.in
six==1.16.0
# via
# asttokens
# latexcodec
# pybtex
# python-dateutil
smmap==5.0.0
# via gitdb
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.4
# via beautifulsoup4
sphinx==4.3.1
# via
# breathe
# myst-nb
# myst-parser
# pydata-sphinx-theme
# rocm-docs-core
# sphinx-book-theme
# sphinx-copybutton
# sphinx-design
# sphinx-external-toc
# sphinx-notfound-page
# sphinxcontrib-bibtex
sphinx-book-theme==1.0.0rc2
# via rocm-docs-core
sphinx-copybutton==0.5.1
# via rocm-docs-core
sphinx-design==0.3.0
# via rocm-docs-core
sphinx-external-toc==0.3.1
# via rocm-docs-core
sphinx-notfound-page==0.8.3
# via rocm-docs-core
sphinxcontrib-applehelp==1.0.4
# via sphinx
sphinxcontrib-bibtex==2.5.0
# via -r .sphinx/requirements.in
sphinxcontrib-devhelp==1.0.2
# via sphinx
sphinxcontrib-htmlhelp==2.0.1
# via sphinx
sphinxcontrib-jsmath==1.0.1
# via sphinx
sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
sqlalchemy==1.4.46
# via jupyter-cache
stack-data==0.6.2
# via ipython
tabulate==0.9.0
# via jupyter-cache
tornado==6.2
# via
# ipykernel
# jupyter-client
traitlets==5.9.0
# via
# comm
# ipykernel
# ipython
# jupyter-client
# jupyter-core
# matplotlib-inline
# nbclient
# nbformat
typing-extensions==4.5.0
# via
# myst-nb
# myst-parser
uc-micro-py==1.0.1
# via linkify-it-py
urllib3==1.26.15
# via requests
wcwidth==0.2.6
# via prompt-toolkit
wrapt==1.15.0
# via deprecated
zipp==3.15.0
# via importlib-metadata
# The following packages are considered to be unsafe in a requirements file:
# setuptools
-===================
+*******************
 API Reference Guide
-===================
+*******************
-------------
+=================
 Introduction
-------------
+=================
 This document contains details of the APIs for the Composable Kernel (CK) library and introduces some of the key design
 principles that are used to write new classes that extend CK functionality.
@@ -16,8 +16,37 @@ Using CK API
 This section describes how to use the CK library API.
------------------
+=================
 CK Datatypes
+=================
+
+-----------------
+DeviceMem
 -----------------
-[TODO]
\ No newline at end of file
+.. doxygenstruct:: DeviceMem
---------------------------
Kernels For Flashattention
---------------------------
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists the classes that are
used in the CK GPU implementation of Flashattention.
**Gridwise classes**
.. doxygenstruct:: ck::GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
**Blockwise classes**
.. doxygenstruct:: ck::ThreadGroupTensorSliceTransfer_v4r1
.. doxygenstruct:: ck::BlockwiseGemmXdlops_v2
.. doxygenstruct:: ck::BlockwiseSoftmax
**Threadwise classes**
.. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic
.. bibliography::