Commit e5ebcc41 authored by Artur Wojcik's avatar Artur Wojcik
Browse files

Merge branch 'develop' into uif2-migraphx

parents 57cdd70b abac8b07
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iomanip> // std::setw used in PerformGemm's perf report
#include <iostream>
#include <numeric>
#include <vector>

#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
// Minimal RAII wrapper around a raw HIP device allocation.
// Owns the buffer for its whole lifetime and frees it in the destructor.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;
    // Allocate mem_size bytes of device memory. `explicit` blocks accidental
    // implicit conversion from an integer. The hipMalloc result is deliberately
    // ignored (example code); p_mem_ stays nullptr on failure.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }
    // Non-copyable / non-movable: the destructor frees p_mem_, so a copy
    // would lead to a double hipFree (Rule of Five).
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
    // Raw device pointer (may be nullptr if allocation failed).
    void* GetDeviceBuffer() { return p_mem_; }
    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
    void* p_mem_;
};
// Block-tiled GEMM kernel built from CK wrapper primitives:
// C(M, N) = A(M, K) * B(N, K)^T, accumulated over K in KPerBlock chunks
// staged through LDS. A and C are row-major; B is indexed as (N, K) with K
// contiguous. Each workgroup (blockIdx.x -> M tile, blockIdx.y -> N tile)
// produces one MPerBlock x NPerBlock tile of C.
//
// p_a / p_b / p_c: raw global-memory pointers reinterpreted as DataType.
// tile_shape:      (MPerBlock, NPerBlock, KPerBlock) compile-time extents.
// thread_layout:   arrangement of the block's threads, used by the copies
//                  and the blockwise gemm.
template <typename DataType,
typename GemmTraits,
ck::index_t scalar_per_vector,
typename BlockShape,
typename ThreadLayout>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
const void* p_b,
void* p_c,
const ck::index_t M,
const ck::index_t N,
const ck::index_t K,
const BlockShape tile_shape,
const ThreadLayout thread_layout)
{
// Compile-time tile extents taken from tile_shape.
constexpr auto MPerBlock = ck::wrapper::size<0>(tile_shape);
constexpr auto NPerBlock = ck::wrapper::size<1>(tile_shape);
constexpr auto KPerBlock = ck::wrapper::size<2>(tile_shape);
// Specify layouts for global memory.
// A: (M, K) stride (K, 1); B: (N, K) stride (K, 1) - K is contiguous for
// both operands; C: (M, N) stride (N, 1), row-major.
const auto a_global_layout =
ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
const auto b_global_layout =
ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
const auto c_global_layout =
ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
// Specify layouts for tiles (fully compile-time, same ordering as above).
constexpr auto a_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(MPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
constexpr auto b_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(NPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
constexpr auto c_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(MPerBlock, NPerBlock), ck::make_tuple(NPerBlock, ck::Number<1>{}));
// Apply padding for global memory so M/N/K need not be tile multiples.
auto a_global_layout_padded = ck::wrapper::pad(a_global_layout, shape(a_tile_layout));
auto b_global_layout_padded = ck::wrapper::pad(b_global_layout, shape(b_tile_layout));
auto c_global_layout_padded = ck::wrapper::pad(c_global_layout, shape(c_tile_layout));
// Make tensors for global memory.
auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const DataType*>(p_a), a_global_layout_padded);
auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const DataType*>(p_b), b_global_layout_padded);
auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<DataType*>(p_c), c_global_layout_padded);
// Allocate lds memory. Sizes are compile-time constants: one A tile and one
// B tile are staged per iteration of the K loop below.
__shared__ DataType lds_a[ck::wrapper::size(a_tile_layout)];
__shared__ DataType lds_b[ck::wrapper::size(b_tile_layout)];
// Make tensors for lds memory.
auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
static_cast<DataType*>(lds_a), a_tile_layout);
auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
static_cast<DataType*>(lds_b), b_tile_layout);
// Specify block index as tuple. blockIdx.x selects the M tile, blockIdx.y
// the N tile; the K dimension is sliced (iterated in the loop below).
const auto block_idxs = ck::make_tuple(static_cast<ck::index_t>(blockIdx.x),
static_cast<ck::index_t>(blockIdx.y),
ck::wrapper::slice());
// Specify access parameters for copy: traverse dim 0 then dim 1, with
// vectorized accesses of scalar_per_vector elements along dim 1 (the
// contiguous K dimension of the A/B tiles).
using DimAccessOrder = ck::Tuple<ck::Number<0>, ck::Number<1>>;
constexpr ck::index_t vector_dim = 1;
// Create tile and partition for C. Use specific function for blockwise_gemm to assign the
// appropriate partitions.
auto c_global_local_tile = ck::wrapper::make_local_tile(
c_global_tensor,
tile_shape,
block_idxs,
make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(KPerBlock)));
auto c_global_local_partition =
ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
decltype(a_tile_layout),
decltype(b_tile_layout),
ck::wrapper::size(thread_layout),
GemmTraits>(c_global_local_tile);
// Create C vgpr to accumulate results.
auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
decltype(a_tile_layout),
decltype(b_tile_layout),
ck::wrapper::size(thread_layout),
GemmTraits>();
// Clear C vgpr.
ck::wrapper::clear(c_vgpr_reg);
// Iterate over K with KPerBlock step: each iteration stages one KPerBlock
// slice of A and B into LDS and accumulates into the C vgpr tile.
const ck::index_t num_loop = ck::math::integer_divide_ceil(K, KPerBlock);
ck::index_t i = 0;
do
{
// Get KPerBlock slice.
const auto k_slice = ck::wrapper::slice(i * KPerBlock, (i + 1) * KPerBlock);
auto a_global_tensor_k_slice = a_global_tensor(ck::wrapper::slice(), k_slice);
auto b_global_tensor_k_slice = b_global_tensor(ck::wrapper::slice(), k_slice);
// Create local tiles for A and B (projections slice away the unused
// N / M dimension respectively).
auto a_global_local_tile = ck::wrapper::make_local_tile(
a_global_tensor_k_slice,
tile_shape,
block_idxs,
make_tuple(ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}));
auto b_global_local_tile = ck::wrapper::make_local_tile(
b_global_tensor_k_slice,
tile_shape,
block_idxs,
make_tuple(ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}));
// Copy from global to lds.
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
a_global_local_tile, a_lds_tensor, thread_layout);
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
b_global_local_tile, b_lds_tensor, thread_layout);
// Synchronize lds so the staged tiles are visible to all threads.
ck::block_sync_lds();
// Execute blockwise gemm.
// NOTE(review): there is no block_sync_lds() between this gemm and the
// next iteration's blockwise_copy that overwrites lds_a/lds_b -
// presumably blockwise_gemm_xdl synchronizes internally or fully
// consumes LDS before returning; confirm.
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
++i;
} while(i < num_loop);
// Copy vgpr results to C global memory.
ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
}
// Host-side driver: allocates device buffers, launches the DeviceGemm kernel
// with timing enabled, and prints average time, TFlops and effective GB/s.
// M, N, K: GEMM problem sizes; tile_shape / thread_layout are forwarded to
// the kernel and also determine the launch grid/block dimensions.
template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          typename BlockShape,
          typename ThreadLayout>
void PerformGemm(const ck::index_t M,
                 const ck::index_t N,
                 const ck::index_t K,
                 const BlockShape& tile_shape,
                 const ThreadLayout& thread_layout)
{
    // Device buffers for A (M x K), B (N x K) and C (M x N). Contents are
    // left uninitialized - this example only measures performance.
    SimpleDeviceMem a_buf(M * K * sizeof(DataType));
    SimpleDeviceMem b_buf(K * N * sizeof(DataType));
    SimpleDeviceMem c_buf(M * N * sizeof(DataType));
    // One workgroup per (MPerBlock x NPerBlock) tile of C.
    const ck::index_t blocks_m =
        ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
    const ck::index_t blocks_n =
        ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));
    const auto gemm_kernel =
        DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout>;
    // Launch with timing enabled; the result is the average runtime in ms.
    const float avg_time_ms = launch_and_time_kernel(StreamConfig{nullptr, true},
                                                     gemm_kernel,
                                                     dim3(blocks_m, blocks_n, 1),
                                                     dim3(ck::wrapper::size(thread_layout)),
                                                     0,
                                                     a_buf.GetDeviceBuffer(),
                                                     b_buf.GetDeviceBuffer(),
                                                     c_buf.GetDeviceBuffer(),
                                                     M,
                                                     N,
                                                     K,
                                                     tile_shape,
                                                     thread_layout);
    // 2*M*N*K flops for a GEMM; A and B each read once, C written once.
    const std::size_t num_flops = std::size_t(2) * M * N * K;
    const std::size_t bytes_moved =
        sizeof(DataType) * M * K + sizeof(DataType) * K * N + sizeof(DataType) * M * N;
    const float tflops     = static_cast<float>(num_flops) / 1.E9 / avg_time_ms;
    const float gb_per_sec = bytes_moved / 1.E6 / avg_time_ms;
    std::cout << "Perf: " << std::setw(10) << avg_time_ms << " ms, " << tflops << " TFlops, "
              << gb_per_sec << " GB/s, " << std::endl;
}
// Example entry point: fp16 GEMM, 3840 x 4096 x 4096.
int main(int argc, char* argv[])
{
    using DataType = ck::half_t;
    // 256 threads arranged as (64, 4) with strides (4, 1).
    const auto threads =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<64>{}, ck::Number<4>{}),
                                 ck::make_tuple(ck::Number<4>{}, ck::Number<1>{}));
    // Per-workgroup tile: MPerBlock=256, NPerBlock=128, KPerBlock=32.
    const auto tile_dims =
        ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<32>{});
    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_8K1, 8>(
        3840, 4096, 4096, tile_dims, threads);
    return 0;
}
// MI300X Perf: 0.471337 ms, 273.369 TFlops, 204.671 GB/s,
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <initializer_list>
#include <vector>
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
// Number of spatial dimensions (D, H, W) for this 3-d convolution example.
static constexpr ck::index_t NumDimSpatial = 3;
// Element type used for both the input image and the output column matrix.
using DataType = float;
// Input memory layout: NDHWGC (C innermost/contiguous, then G, W, H, D, N).
// NOTE(review): declared for documentation purposes; the strides below are
// built by hand to match this layout rather than via this alias.
using InputLayout = ck::tensor_layout::convolution::NDHWGC;
// Minimal RAII wrapper around a raw HIP device allocation.
// Owns the buffer for its whole lifetime and frees it in the destructor.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;
    // Allocate mem_size bytes of device memory. `explicit` blocks accidental
    // implicit conversion from an integer. The hipMalloc result is deliberately
    // ignored (example code); p_mem_ stays nullptr on failure.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }
    // Non-copyable / non-movable: the destructor frees p_mem_, so a copy
    // would lead to a double hipFree (Rule of Five).
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
    // Raw device pointer (may be nullptr if allocation failed).
    void* GetDeviceBuffer() { return p_mem_; }
    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
    void* p_mem_;
};
// Tiled elementwise copy kernel: each workgroup moves one tile of the input
// tensor to the corresponding tile of the output tensor. Because the two
// tensors share the same shape but different strides, this realizes the
// image-to-column gather set up by the host code. No padding is supported.
template <typename InputTensor, typename OutputTensor, typename BlockShape, typename ThreadLayout>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__
DeviceImageToColumnPad0(InputTensor input_tensor,
                        OutputTensor output_tensor,
                        const BlockShape tile_shape,
                        const ThreadLayout thread_layout)
{
    // Grid is launched as (dim1, dim0): blockIdx.x walks dimension 1 and
    // blockIdx.y walks dimension 0 of the tensors.
    const auto tile_coord =
        ck::make_tuple(static_cast<ck::index_t>(blockIdx.y), static_cast<ck::index_t>(blockIdx.x));
    // Cut this workgroup's tile out of the global tensors.
    auto in_tile  = ck::wrapper::make_local_tile(input_tensor, tile_shape, tile_coord);
    auto out_tile = ck::wrapper::make_local_tile(output_tensor, tile_shape, tile_coord);
    // Split the tile between the block's threads.
    const auto in_thread_slice =
        ck::wrapper::make_local_partition(in_tile, thread_layout, threadIdx.x);
    auto out_thread_slice =
        ck::wrapper::make_local_partition(out_tile, thread_layout, threadIdx.x);
    // Per-thread copy, vectorized 4 scalars at a time along dimension 1.
    using AccessOrder                        = ck::Tuple<ck::Number<0>, ck::Number<1>>;
    constexpr ck::index_t vec_dim            = 1;
    constexpr ck::index_t scalars_per_vector = 4;
    ck::wrapper::copy<AccessOrder, vec_dim, scalars_per_vector>(in_thread_slice,
                                                                out_thread_slice);
}
// Image-to-column (im2col) without padding: gathers every convolution window
// of an NDHWGC input into a row of the output matrix, then reports bandwidth.
// The trick is purely in the layouts: input and output share the logical
// shape (G, (Wo, Ho, Do, N)) x (C, X, Y, Z), but the input strides map each
// (output position, filter tap) pair back into the strided/dilated image,
// while the output strides are a dense row-major matrix. A plain tiled copy
// kernel between the two layouts then performs the gather.
void PerformImageToColumnPad0(const ck::index_t G,
const ck::index_t N,
const ck::index_t Di,
const ck::index_t Hi,
const ck::index_t Wi,
const ck::index_t Do,
const ck::index_t Ho,
const ck::index_t Wo,
const ck::index_t C,
const ck::index_t Z,
const ck::index_t Y,
const ck::index_t X,
std::array<ck::index_t, NumDimSpatial> filter_strides,
std::array<ck::index_t, NumDimSpatial> filter_dilations)
{
// ZYXC: one output row (all filter taps x channels); GC: stride of one
// W-pixel in the NDHWGC input (C innermost, then G).
const ck::index_t ZYXC = Z * Y * X * C;
const ck::index_t GC = G * C;
// shape: (G, (Wo, Ho, Do, N)), (C, X, Y, Z))
const auto shape = ck::make_tuple(ck::make_tuple(G, ck::make_tuple(Wo, Ho, Do, N)),
ck::make_tuple(C, X, Y, Z));
// Input strides: output-position dims (Wo, Ho, Do) advance by the filter
// stride in the image; filter-tap dims (X, Y, Z) advance by the dilation.
// All in units of GC per W-pixel, matching NDHWGC.
const auto in_strides =
ck::make_tuple(ck::make_tuple(C,
ck::make_tuple(filter_strides[2] * GC,
filter_strides[1] * Wi * GC,
filter_strides[0] * Hi * Wi * GC,
Di * Hi * Wi * GC)),
ck::make_tuple(1,
filter_dilations[2] * GC,
filter_dilations[1] * Wi * GC,
filter_dilations[0] * Hi * Wi * GC));
const auto in_layout = ck::wrapper::make_layout(shape, in_strides);
// Output strides: dense (G * N * Do * Ho * Wo) x ZYXC matrix, row length
// ZYXC contiguous (C fastest, then X, Y, Z).
const auto out_strides = ck::make_tuple(
ck::make_tuple(
ZYXC,
ck::make_tuple(ZYXC * G, Wo * ZYXC * G, Ho * Wo * ZYXC * G, Do * Ho * Wo * ZYXC * G)),
ck::make_tuple(1, C, X * C, Y * X * C));
const auto out_layout = ck::wrapper::make_layout(shape, out_strides);
// Number of elements in the source image (NDHWGC).
const ck::index_t input_size = N * Di * Hi * Wi * GC;
// Global memory buffers
SimpleDeviceMem in_buf(input_size * sizeof(DataType));
SimpleDeviceMem out_buf(ck::wrapper::size(out_layout) * sizeof(DataType));
// User can choose appropriate number of threads and sizes per block
const auto thread_layout =
ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{}, ck::Number<16>{}),
ck::make_tuple(ck::Number<16>{}, ck::Number<1>{}));
// This example doesn't support padding, user should select tile sizes
// which are divisible by the shape.
const auto tile_shape = ck::make_tuple(ck::Number<32>{}, ck::Number<64>{});
// Create buffers for global memory
auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const DataType*>(in_buf.GetDeviceBuffer()), in_layout);
auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<DataType*>(out_buf.GetDeviceBuffer()), out_layout);
// grid layout (dim1, dim0)
// NOTE(review): the kernel copies with scalar_per_vector = 4 along dim 1;
// presumably this requires the contiguous extent (C) to be a multiple of 4 -
// confirm against the wrapper's copy requirements.
const ck::index_t grid_size_x = ck::math::integer_divide_ceil(ck::wrapper::size<1>(in_layout),
ck::wrapper::size<1>(tile_shape));
const ck::index_t grid_size_y = ck::math::integer_divide_ceil(ck::wrapper::size<0>(in_layout),
ck::wrapper::size<0>(tile_shape));
const auto kernel = DeviceImageToColumnPad0<decltype(input_tensor_global),
decltype(output_tensor_global),
decltype(tile_shape),
decltype(thread_layout)>;
const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
kernel,
dim3(grid_size_x, grid_size_y, 1),
dim3(ck::wrapper::size(thread_layout)),
0,
input_tensor_global,
output_tensor_global,
tile_shape,
thread_layout);
// Every output element is read once from the image and written once (x2).
std::size_t num_btype = G * N * Do * Ho * Wo * ZYXC * 2 * sizeof(DataType);
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< std::endl;
}
int main(int argc, char* argv[])
{
constexpr ck::index_t G = 4; // number of groups
constexpr ck::index_t N = 32; // batch
constexpr ck::index_t C = 64; // input channel (per group)
constexpr ck::index_t Z = 3; // filter D
constexpr ck::index_t Y = 3; // filter H
constexpr ck::index_t X = 3; // filter W
constexpr ck::index_t Di = 9; // input D
constexpr ck::index_t Hi = 9; // input H
constexpr ck::index_t Wi = 7; // input W
constexpr ck::index_t Do = 7; // output D
constexpr ck::index_t Ho = 7; // output H
constexpr ck::index_t Wo = 5; // output W
PerformImageToColumnPad0(G,
N,
Di,
Hi,
Wi,
Do,
Ho,
Wo,
C,
Z,
Y,
X,
{1, 1, 1} /*filter_strides*/,
{1, 1, 1} /*filter_dilations*/);
return 0;
}
// MI100 Perf: 0.255178 ms, 1698.9 GB/s,
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <numeric>
#include <cstdlib>
#include <iostream>
#include <initializer_list>
#include <vector>
#include "ck/library/utility/host_tensor.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
// Minimal RAII wrapper around a raw HIP device allocation.
// Owns the buffer for its whole lifetime and frees it in the destructor.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;
    // Allocate mem_size bytes of device memory. `explicit` blocks accidental
    // implicit conversion from an integer. The hipMalloc result is deliberately
    // ignored (example code); p_mem_ stays nullptr on failure.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }
    // Non-copyable / non-movable: the destructor frees p_mem_, so a copy
    // would lead to a double hipFree (Rule of Five).
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;
    // Raw device pointer (may be nullptr if allocation failed).
    void* GetDeviceBuffer() { return p_mem_; }
    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
    void* p_mem_;
};
// Compile-time switch: return `layout` either wrapped with padding to the
// given padding_dims (DoPad == true) or completely untouched. The branch is
// resolved at compile time, so the two return types may differ.
template <bool DoPad, typename Layout, typename PaddingDims>
__device__ auto ApplyPadding(const Layout& layout, const PaddingDims& padding_dims)
{
    if constexpr(!DoPad)
    {
        // No padding requested - pass the layout through unchanged.
        return layout;
    }
    else
    {
        return ck::wrapper::pad(layout, padding_dims);
    }
}
// Pipelined (software-prefetch) GEMM kernel: C(M, N) = A(M, K) * B(N, K)^T.
// K is split as K = K0 * K1 (K1 from GemmTraits) and A/B are staged
// global -> VGPR -> LDS, so the next K-chunk's global loads overlap with the
// blockwise gemm consuming the current LDS tile.
// DoPadding selects at compile time whether global layouts are padded to
// tile multiples; with DoPadding == false the problem sizes must already be
// divisible by the tile.
template <typename DataType,
typename GemmTraits,
ck::index_t scalar_per_vector,
typename BlockShape,
typename ThreadLayout,
bool DoPadding>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
const void* p_b,
void* p_c,
const ck::index_t M,
const ck::index_t N,
const ck::index_t K,
const BlockShape tile_shape,
const ThreadLayout thread_layout)
{
// Tile extents and the K = K0 * K1 split used by the xdl gemm.
constexpr auto MPerBlock = ck::wrapper::size<0>(tile_shape);
constexpr auto NPerBlock = ck::wrapper::size<1>(tile_shape);
constexpr auto KPerBlock = ck::wrapper::size<2>(tile_shape);
constexpr auto K1 = GemmTraits::K1;
constexpr auto K0PerBlock = KPerBlock / K1;
const auto K0 = ck::math::integer_divide_ceil(K, K1);
// Tile shape in the (K0, M, N, K1) coordinate system used below.
const auto tile_shape_k0_m_n_k1 = ck::make_tuple(K0PerBlock, MPerBlock, NPerBlock, K1);
// Create layouts for global memory.
// A: (M, K) stride (K, 1); B: (N, K) stride (K, 1); C: (M, N) stride (N, 1).
const auto a_global_layout =
ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
const auto b_global_layout =
ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
const auto c_global_layout =
ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
// Apply padding (no-op when DoPadding == false).
auto a_padded_global_layout =
ApplyPadding<DoPadding>(a_global_layout, ck::make_tuple(MPerBlock, KPerBlock));
auto b_padded_global_layout =
ApplyPadding<DoPadding>(b_global_layout, ck::make_tuple(NPerBlock, KPerBlock));
auto c_padded_global_layout =
ApplyPadding<DoPadding>(c_global_layout, ck::make_tuple(MPerBlock, NPerBlock));
// Reshape from M,K to K0,M,K1 (dim 1 becomes K0; M and K1 become dims 1,2).
const auto reshaped_dims_idxs =
ck::make_tuple(ck::Number<1>{}, ck::make_tuple(ck::Number<0>{}, ck::Number<2>{}));
auto a_padded_unmerged_global_layout =
ck::wrapper::unmerge<1>(a_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
auto b_padded_unmerged_global_layout =
ck::wrapper::unmerge<1>(b_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
// Create tensors for global memory
auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const DataType*>(p_a), a_padded_unmerged_global_layout);
auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const DataType*>(p_b), b_padded_unmerged_global_layout);
auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<DataType*>(p_c), c_padded_global_layout);
// Create layouts and tensors for lds memory. The K0 row stride is
// (MPerBlock + 1) * K1 rather than MPerBlock * K1, i.e. K1 extra elements
// of padding per K0 row - presumably to reduce LDS bank conflicts; confirm.
constexpr auto a_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(K0PerBlock, MPerBlock, K1),
ck::make_tuple((MPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
constexpr auto b_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(K0PerBlock, NPerBlock, K1),
ck::make_tuple((NPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
// NOTE(review): the arrays are over-allocated by K0PerBlock elements as
// slack for the padded row stride - verify this matches the layout's true
// memory footprint (the per-row padding above is K1 elements per K0 row).
__shared__ DataType lds_a[ck::wrapper::size(a_tile_layout) + K0PerBlock];
__shared__ DataType lds_b[ck::wrapper::size(b_tile_layout) + K0PerBlock];
auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
static_cast<DataType*>(lds_a), a_tile_layout);
auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
static_cast<DataType*>(lds_b), b_tile_layout);
// Block coordinate in (K0, M-tile, N-tile, K1); only M/N come from blockIdx,
// the K0/K1 dims are sliced.
const auto block_idxs = ck::make_tuple(ck::wrapper::slice(),
static_cast<ck::index_t>(blockIdx.x),
static_cast<ck::index_t>(blockIdx.y),
ck::wrapper::slice());
// Copy traversal order (M/N, then K0, then K1) with vectorized accesses
// along dim 2 (K1, the contiguous dimension).
using DimAccessOrder = ck::Tuple<ck::Number<1>, ck::Number<0>, ck::Number<2>>;
constexpr ck::index_t vector_dim = 2;
// Create tile and partition for C global memory. Use specific gemm
// functions to get appropriate layouts.
auto c_global_local_tile =
ck::wrapper::make_local_tile(c_global_tensor,
tile_shape_k0_m_n_k1,
block_idxs,
make_tuple(ck::wrapper::slice(K0PerBlock),
ck::Number<1>{},
ck::Number<1>{},
ck::wrapper::slice(K1)));
auto c_global_local_partition =
ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
decltype(a_tile_layout),
decltype(b_tile_layout),
ck::wrapper::size(thread_layout),
GemmTraits>(c_global_local_tile);
// Define and clear c vgpr register
auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
decltype(a_tile_layout),
decltype(b_tile_layout),
ck::wrapper::size(thread_layout),
GemmTraits>();
ck::wrapper::clear(c_vgpr_reg);
// Local partitions for lds memory
auto a_lds_tensor_local_partition =
ck::wrapper::make_local_partition(a_lds_tensor, thread_layout, threadIdx.x);
auto b_lds_tensor_local_partition =
ck::wrapper::make_local_partition(b_lds_tensor, thread_layout, threadIdx.x);
// Lamda to slice tensor, then create local tile and partition.
// `i` selects the i-th K0PerBlock chunk of the K0 dimension.
auto make_global_partition = [&](auto tensor, auto projection, ck::index_t i) {
const auto k_slice =
ck::make_tuple(ck::wrapper::slice(i * K0PerBlock, (i + 1) * K0PerBlock),
ck::wrapper::slice(),
ck::wrapper::slice());
auto local_tile = ck::wrapper::make_local_tile(
tensor(k_slice), tile_shape_k0_m_n_k1, block_idxs, projection);
return ck::wrapper::make_local_partition(local_tile, thread_layout, threadIdx.x);
};
// Per-thread views of the first (i = 0) K-chunk of A and B.
auto a_global_local_partition = make_global_partition(
a_global_tensor,
make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
0);
auto b_global_local_partition = make_global_partition(
b_global_tensor,
make_tuple(ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
0);
// Per-thread VGPR staging buffers (row-major vgpr layout).
auto a_vgpr_tensor =
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
ck::wrapper::make_layout(
shape(a_global_local_partition),
ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
ck::wrapper::size<2>(a_global_local_partition),
ck::wrapper::size<2>(a_global_local_partition),
ck::Number<1>{})));
// NOTE(review): the strides below are computed from a_global_local_partition
// while the shape comes from b_global_local_partition. If the per-thread A
// and B partitions ever differ in shape this is a copy-paste bug - should
// presumably use the B partition's sizes; confirm.
auto b_vgpr_tensor =
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
ck::wrapper::make_layout(
shape(b_global_local_partition),
ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
ck::wrapper::size<2>(a_global_local_partition),
ck::wrapper::size<2>(a_global_local_partition),
ck::Number<1>{})));
// Preload the first K-chunk: global -> vgpr, then vgpr -> lds.
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_global_local_partition,
a_vgpr_tensor);
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_global_local_partition,
b_vgpr_tensor);
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_vgpr_tensor,
a_lds_tensor_local_partition);
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_vgpr_tensor,
b_lds_tensor_local_partition);
// Pipeline loop. readfirstlane forces num_loop into a scalar register
// (uniform across the wavefront).
const ck::index_t num_loop =
__builtin_amdgcn_readfirstlane(ck::math::integer_divide_ceil(K, KPerBlock));
// Skip if only one tile should be processed (the tail below handles it).
if(num_loop > 1)
{
ck::index_t i = 0;
do
{
// Per-thread views of the NEXT (i + 1) K-chunk, prefetched while the
// gemm below consumes the current chunk held in LDS.
auto a_global_local_partition_i = make_global_partition(
a_global_tensor,
make_tuple(
ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
i + 1);
auto b_global_local_partition_i = make_global_partition(
b_global_tensor,
make_tuple(
ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
i + 1);
// Copy data to A vgpr.
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
a_global_local_partition_i, a_vgpr_tensor);
// Synchronize: ensure all LDS writes (preload or previous iteration)
// are visible before the gemm reads them.
ck::block_sync_lds();
// Copy data to B vgpr.
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
b_global_local_partition_i, b_vgpr_tensor);
// Perform gemm on the current LDS tiles.
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
// Synchronize: all threads must be done reading LDS before overwrite.
ck::block_sync_lds();
// Copy the prefetched chunk from vgpr into the A and B lds tiles.
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
a_vgpr_tensor, a_lds_tensor_local_partition);
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
b_vgpr_tensor, b_lds_tensor_local_partition);
++i;
} while(i < (num_loop - 1));
}
// Handle tail: gemm on the last chunk already resident in LDS.
ck::block_sync_lds();
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
// Store data from C vgpr to C global memory.
ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
}
// Host-side driver for the pipelined DeviceGemm: allocates device buffers,
// launches the kernel with timing enabled, and prints average time, TFlops
// and effective GB/s. DoPadding is forwarded to the kernel and must be false
// only when M/N/K are divisible by the tile sizes.
template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          bool DoPadding,
          typename BlockShape,
          typename ThreadLayout>
void PerformGemm(const ck::index_t M,
                 const ck::index_t N,
                 const ck::index_t K,
                 const BlockShape& tile_shape,
                 const ThreadLayout& thread_layout)
{
    // Device buffers for A (M x K), B (N x K) and C (M x N). Contents are
    // left uninitialized - this example only measures performance.
    SimpleDeviceMem a_buf(M * K * sizeof(DataType));
    SimpleDeviceMem b_buf(K * N * sizeof(DataType));
    SimpleDeviceMem c_buf(M * N * sizeof(DataType));
    // One workgroup per (MPerBlock x NPerBlock) tile of C.
    const ck::index_t blocks_m =
        ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
    const ck::index_t blocks_n =
        ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));
    const auto gemm_kernel =
        DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout, DoPadding>;
    // Launch with timing enabled; the result is the average runtime in ms.
    const float avg_time_ms = launch_and_time_kernel(StreamConfig{nullptr, true},
                                                     gemm_kernel,
                                                     dim3(blocks_m, blocks_n, 1),
                                                     dim3(ck::wrapper::size(thread_layout)),
                                                     0,
                                                     a_buf.GetDeviceBuffer(),
                                                     b_buf.GetDeviceBuffer(),
                                                     c_buf.GetDeviceBuffer(),
                                                     M,
                                                     N,
                                                     K,
                                                     tile_shape,
                                                     thread_layout);
    // 2*M*N*K flops for a GEMM; A and B each read once, C written once.
    const std::size_t num_flops = std::size_t(2) * M * N * K;
    const std::size_t bytes_moved =
        sizeof(DataType) * M * K + sizeof(DataType) * K * N + sizeof(DataType) * M * N;
    const float tflops     = static_cast<float>(num_flops) / 1.E9 / avg_time_ms;
    const float gb_per_sec = bytes_moved / 1.E6 / avg_time_ms;
    // NOTE(review): std::setw requires <iomanip>, which this translation unit
    // does not include directly - presumably pulled in transitively; confirm.
    std::cout << "Perf: " << std::setw(10) << avg_time_ms << " ms, " << tflops << " TFlops, "
              << gb_per_sec << " GB/s, " << std::endl;
}
// Example entry point: fp16 pipelined GEMM, 3840 x 4096 x 4096.
int main(int argc, char* argv[])
{
    using DataType = ck::half_t;
    // 256 threads arranged as (4, 64, 1) with strides (1, 4, 1).
    const auto threads = ck::wrapper::make_layout(
        ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
        ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
    // Workgroup tile: MPerBlock=256, NPerBlock=128, KPerBlock=32.
    const auto tile_dims =
        ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<32>{});
    // DoPadding == false: the chosen sizes divide the tile exactly.
    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_8K1, 8, false>(
        3840, 4096, 4096, tile_dims, threads);
    return 0;
}
// MI300X Perf: 0.411552 ms, 313.081 TFlops, 234.403 GB/s,
......@@ -48,7 +48,7 @@ else()
endif()
endif()
find_package(composable_kernel COMPONENTS device_operations)
find_package(composable_kernel COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_contraction_operations device_reduction_operations)
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
......
......@@ -149,7 +149,7 @@ function(clang_tidy_check TARGET)
add_custom_target(${tidy_target}
# for some targets clang-tidy not able to get information from .clang-tidy
DEPENDS ${SOURCE}
COMMAND ${CLANG_TIDY_COMMAND} "-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__\; __HIP_ROCclr__\}\]\}" ${SOURCE} "-export-fixes=${CLANG_TIDY_FIXIT_DIR}/${TARGET}-${tidy_file}.yaml"
COMMAND ${CLANG_TIDY_COMMAND} "-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__\; __HIP_PLATFORM_AMD__\; __HIP_ROCclr__\}\]\}" ${SOURCE} "-export-fixes=${CLANG_TIDY_FIXIT_DIR}/${TARGET}-${tidy_file}.yaml"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "clang-tidy: Running clang-tidy on target ${SOURCE}..."
)
......
......@@ -309,6 +309,8 @@ XML_OUTPUT
XML_PROGRAMLISTING
)
set(WARN_AS_ERROR YES)
set(DOXYGEN_CONFIG_FILE "${CMAKE_CURRENT_BINARY_DIR}/doxygen/doxygen.conf" CACHE PATH "Path to generated doxygen configuration file")
function(add_doxygen_doc)
......
......@@ -70,6 +70,7 @@ else()
-Wno-option-ignored
-Wsign-compare
-Wno-extra-semi-stmt
-Wno-unused-template
)
if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang")
list(APPEND CMAKE_COMPILER_WARNINGS
......
# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
# Expose getopt via the imported INTERFACE target `getopt::getopt`.
# On non-Windows hosts the target stays empty (getopt comes from the system
# C library); on Windows a portable implementation is fetched and built.
add_library(getopt::getopt INTERFACE IMPORTED GLOBAL)
if(WIN32)
# Windows has no native getopt; fetch the `wingetopt` port and build it
# statically into the interface target.
include(FetchContent)
FetchContent_Declare(
getopt
GIT_REPOSITORY https://github.com/apwojcik/getopt.git
GIT_TAG main
SYSTEM
)
# Force a static build while preserving the caller's BUILD_SHARED_LIBS.
set(__build_shared_libs ${BUILD_SHARED_LIBS})
set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "")
FetchContent_MakeAvailable(getopt)
# Restore the old value of BUILD_SHARED_LIBS
set(BUILD_SHARED_LIBS ${__build_shared_libs} CACHE BOOL "Type of libraries to build" FORCE)
FetchContent_GetProperties(getopt)
target_link_libraries(getopt::getopt INTERFACE wingetopt)
target_include_directories(getopt::getopt INTERFACE ${getopt_SOURCE_DIR}/src)
endif()
\ No newline at end of file
......@@ -6,9 +6,42 @@ if(GOOGLETEST_DIR)
set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override")
endif()
message(STATUS "Fetching GoogleTest")
FetchContent_Declare(
GTest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG f8d7d77c06936315286eb55f8de22cd23c188571
)
# Suppress ROCMChecks WARNING on GoogleTests
set(ROCM_DISABLE_CHECKS FALSE)
macro(rocm_check_toolchain_var var access value list_file)
if(NOT ROCM_DISABLE_CHECKS)
_rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}")
endif()
endmacro()
if(WIN32)
set(gtest_force_shared_crt ON CACHE INTERNAL "")
endif()
set(BUILD_GMOCK OFF CACHE INTERNAL "")
set(INSTALL_GTEST OFF CACHE INTERNAL "")
# Store the current value of BUILD_SHARED_LIBS
set(__build_shared_libs ${BUILD_SHARED_LIBS})
set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "")
list(APPEND GTEST_CMAKE_CXX_FLAGS
set(ROCM_DISABLE_CHECKS TRUE)
FetchContent_MakeAvailable(GTest)
set(ROCM_DISABLE_CHECKS FALSE)
# Restore the old value of BUILD_SHARED_LIBS
set(BUILD_SHARED_LIBS ${__build_shared_libs} CACHE BOOL "Type of libraries to build" FORCE)
set(BUILD_GMOCK OFF CACHE INTERNAL "")
set(INSTALL_GTEST OFF CACHE INTERNAL "")
set(GTEST_CXX_FLAGS
-Wno-undef
-Wno-reserved-identifier
-Wno-global-constructors
......@@ -22,29 +55,16 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-old-style-cast
-Wno-deprecated
-Wno-unsafe-buffer-usage
-Wno-float-equal
)
message(STATUS "Suppressing googletest warnings with flags: ${GTEST_CXX_FLAGS}")
FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG b85864c64758dec007208e56af933fc3f52044ee
)
# Will be necessary for windows build
# set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_GetProperties(googletest)
if(NOT googletest_POPULATED)
FetchContent_Populate(googletest)
add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
if(WIN32)
list(APPEND GTEST_CXX_FLAGS
-Wno-suggest-destructor-override
-Wno-suggest-override
-Wno-nonportable-system-include-path
-Wno-language-extension-token)
endif()
target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
set_target_properties(gtest PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(gtest_main PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(gmock PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(gmock_main PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_options(gtest PRIVATE ${GTEST_CXX_FLAGS})
target_compile_options(gtest_main PRIVATE ${GTEST_CXX_FLAGS})
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
*******************
API Reference Guide
*******************
.. _api-reference:
********************************************************************
API reference guide
********************************************************************
=================
Introduction
=================
This document contains details of the APIs for the Composable Kernel (CK) library and introduces
some of the key design principles that are used to write new classes that extend CK functionality.
......@@ -30,7 +32,7 @@ DeviceMem
Kernels For Flashattention
---------------------------
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists
the classes that are used in the CK GPU implementation of Flashattention.
**Gridwise classes**
......
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _contributing-to:
********************************************************************
Contributor's guide
********************************************************************
This chapter explains the rules and process for contributing to the Composable Kernel project.
Getting started
===============
#. **Documentation:** Before contributing to the library, familiarize yourself with the
`Composable Kernel User Guide <https://rocm.docs.amd.com/projects/composable_kernel/en/latest/>`_.
It provides insight into the core concepts, environment configuration, and steps to obtain or
build the library. You can also find some of this information in the
`README file <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_
on the project's GitHub page.
#. **Additional reading:** The blog post `AMD Composable Kernel library: efficient fused kernels for AI apps with just a few lines of code <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_ provides a deeper understanding of the CK library and showcases its performance capabilities.
<https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_
from the AMD Community portal. It offers a deeper understanding of the library's objectives and showcases its performance capabilities.
#. **General information:** For broader information about AMD products, consider exploring the
`AMD Developer Central portal <https://www.amd.com/en/developer.html>`_.
How to contribute
===================
Contributor's Guide
===================
Pull-request guidelines
=======================
You can make an impact by reporting issues or proposing code enhancements through pull requests.
Reporting issues
----------------
Use `GitHub issues <https://github.com/ROCm/composable_kernel/issues>`_
to track public bugs and enhancement requests.
If you encounter an issue with the library, please check if the problem has already been
reported by searching existing issues on GitHub. If your issue seems unique, please submit a new
issue. All reported issues must include:
* A comprehensive description of the problem, including:
* What did you observe?
* Why do you think it is a bug (if it seems like one)?
* What did you expect to happen? What would indicate the resolution of the problem?
* Are there any known workarounds?
* Your configuration details, including:
* Which GPU are you using?
* Which OS version are you on?
* Which ROCm version are you using?
* Are you using a Docker image? If so, which one?
* Steps to reproduce the issue, including:
* What actions trigger the issue? What are the reproduction steps?
* If you build the library from scratch, what CMake command did you use?
* How frequently does this issue happen? Does it reproduce every time? Or is it a sporadic issue?
Before submitting any issue, ensure you have addressed all relevant questions from the checklist.
Creating Pull Requests
----------------------
You can submit `Pull Requests (PR) on GitHub
<https://github.com/ROCm/composable_kernel/pulls>`_.
All contributors are required to develop their changes on a separate branch and then create a
pull request to merge their changes into the `develop` branch, which is the default
development branch in the Composable Kernel project. All external contributors must use their own
forks of the project to develop their changes.
When submitting a Pull Request you should:
* Describe the change providing information about the motivation for the change and a general
description of all code modifications.
* Verify and test the change:
* Run any relevant existing tests.
* Write new tests if added functionality is not covered by current tests.
* Ensure your changes align with the coding style defined in the ``.clang-format`` file located in
the project's root directory. We leverage `pre-commit` to run `clang-format` automatically. We
highly recommend contributors utilize this method to maintain consistent code formatting.
Instructions on setting up `pre-commit` can be found in the project's
`README file <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_.
* Link your PR to any related issues:
* If there is an issue that is resolved by your change, please provide a link to the issue in
the description of your pull request.
* For larger contributions, structure your change into a sequence of smaller, focused commits, each
addressing a particular aspect or fix.
Following the above guidelines ensures a seamless review process and faster assistance from our
end.
[TODO]
Thank you for your commitment to enhancing the Composable Kernel project!
==========================
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _supported-primitives:
********************************************************************
Supported Primitives Guide
==========================
********************************************************************
This document contains details of supported primitives in Composable Kernel (CK). In contrast to the
API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins
the algorithms implemented in CK.
This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK.
------------
Softmax
------------
For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can decompose the
For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` you can decompose the
softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,
.. math::
......@@ -27,7 +31,7 @@ where :math:`f(x^{(j)}) = \exp( x^{(j)} - m(x^{(j)}) )` is of size :math:`B` and
:math:`z(x^{(j)}) = f(x_1^{(j)})+ \ldots+ f(x_B^{(j)})` is a scalar.
For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size
:math:`B_r \times B_c` we can compute the row-wise softmax as follows.
:math:`B_r \times B_c` you can compute the row-wise softmax as follows.
For :math:`j` from :math:`1` to :math:`T_c`, and :math:`i` from :math:`1` to :math:`T_r` calculate,
......
......@@ -4,23 +4,34 @@
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import subprocess
import re
from rocm_docs import ROCmDocs
html_theme_options = {"flavor": "list"}
name = "Composable Kernel"
get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt'
version = subprocess.getoutput(get_version)
if len(version) > 0:
name = f"{name} {version}"
with open('../CMakeLists.txt', encoding='utf-8') as f:
match = re.search(r'.*set\(version ([0-9.]+)[^0-9.]+', f.read())
if not match:
raise ValueError("VERSION not found!")
version_number = match[1]
left_nav_title = f"Composable Kernel {version_number} Documentation"
# for PDF output on Read the Docs
project = "Composable Kernel Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
version = version_number
release = version_number
external_toc_path = "./sphinx/_toc.yml"
docs_core = ROCmDocs(f"{name} Documentation")
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml")
docs_core = ROCmDocs(left_nav_title)
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
docs_core.setup()
external_projects_current_project = "composable_kernel"
mathjax3_config = {
'tex': {
'macros': {
......
===================
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _docker-hub:
********************************************************************
CK Docker Hub
===================
********************************************************************
-------------------------------------
Why do I need this?
-------------------------------------
===================
To make things simpler, and bring Composable Kernel and its dependencies together,
docker images can be found on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel/tags>`_. Docker images provide a complete image of the OS, the Composable Kernel library, and its dependencies in a single downloadable file.
To make our lives easier and bring Composable Kernel dependencies together, we recommend using
docker images that can be found on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel>`_.
Refer to `Docker Overview <https://docs.docker.com/get-started/overview/>`_ for more information on Docker images and containers.
-------------------------------------
So what is Composable Kernel?
-------------------------------------
Which image is right for me?
============================
The image naming includes information related to the docker image.
For example ``ck_ub20.04_rocm6.0`` indicates the following:
* ``ck`` - made for running Composable Kernel;
* ``ub20.04`` - based on Ubuntu 20.04;
* ``rocm6.0`` - ROCm platform version 6.0.
The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical
kernels for machine learning workloads across multiple architectures, including GPUs and CPUs,
through general-purpose kernel languages such as HIP C++.
Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point. Use the ``docker pull`` command to download the file::
To get the CK library::
docker pull rocm/composable_kernel:ck_ub20.04_rocm6.0
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git
What is inside the image?
-------------------------
run a docker container::
The docker images have everything you need for running CK including:
* `ROCm <https://www.amd.com/en/graphics/servers-solutions-rocm>`_
* `CMake <https://cmake.org/getting-started/>`_
* `Compiler <https://github.com/ROCm/llvm-project>`_
* `Composable Kernel library <https://github.com/ROCm/composable_kernel>`_
Running the docker container
============================
After downloading the docker image, you can start the container using one of a number of commands. Start with the ``docker run`` command as shown below::
docker run \
-it \
......@@ -30,70 +52,50 @@ run a docker container::
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/composable_kernel:ck_ub20.04_rocm5.6 \
rocm/composable_kernel:ck_ub20.04_rocm6.0 \
/bin/bash
and build the CK::
After starting the bash shell, the docker container's current working folder is ``~/workspace``. The library path is ``~/workspace/composable_kernel``. Navigate to the library folder to begin the tutorial as explained in :ref:`hello-world`:
mkdir build && cd build
# Need to specify target ID, example below is for gfx908 and gfx90a
cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
..
.. note::
and::
If your local workspace folder is different from ``${HOME}``, adjust ``${PATH_TO_LOCAL_WORKSPACE}`` in the ``-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace`` line of the ``docker run`` command to fit your folder structure.
make -j examples tests
Stop and restart the docker image
=================================
To run all the test cases including tests and examples run::
After finishing the tutorial, or when you have completed your work session, you can exit the docker container, or stop it so you can restart it at another time. Exiting the container leaves it in the active state, so it can be resumed from where you left off. Stopping the container shuts it down and returns it to its initial state.
make test
Use the ``Ctrl-D`` option to exit the container, while leaving it active, so you can return to the container in its current state to resume the tutorial, or pick up your project where you left off.
We can also run specific examples or tests like::
To restart the active container use the ``docker exec`` command to specify the container name and options as follows::
./bin/example_gemm_xdl_fp16
./bin/test_gemm_fp16
docker exec -it <container_name> bash
For more details visit `CK github repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_,
`CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example>`_,
`even more CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example>`_.
Where:
-------------------------------------
And what is inside?
-------------------------------------
* `exec` is the docker command
* `-it` is the interactive option for `exec`
* `<container_name>` specifies an active container on the system
* `bash` specifies the command to run in the interactive shell
The docker images have everything you need for running CK including:
.. note::
* `ROCm <https://www.amd.com/en/graphics/servers-solutions-rocm>`_
* `CMake <https://cmake.org/>`_
* `Compiler <https://github.com/RadeonOpenCompute/llvm-project>`_
You can use the ``docker container ls`` command to list the active containers on the system.
-------------------------------------
Which image is right for me?
-------------------------------------
Let's take a look at the image naming, for example ``ck_ub20.04_rocm5.6``. The image specs are:
* ``ck`` - made for running Composable Kernel;
* ``ub20.04`` - based on Ubuntu 20.04;
* ``rocm5.6`` - ROCm platform version 5.6.
To start a container from the image, use the ``docker start`` command::
So just pick the right image for your project dependencies and you're all set.
docker start <container_name>
-------------------------------------
DIY starts here
-------------------------------------
Then use the docker exec command as shown above to start the bash shell.
If you need to customize a docker image or just can't stop tinkering, feel free to adjust the
`Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_
for your needs.
Use the ``docker stop`` command to stop the container and restore the image to its initial state::
-------------------------------------
License
-------------------------------------
docker stop <container_name>
Editing the docker image
========================
CK is released under the MIT `license <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/LICENSE>`_.
If you want to customize the docker image, edit the
`Dockerfile <https://github.com/ROCm/composable_kernel/blob/develop/Dockerfile>`_
from the GitHub repository to suit your needs.
......@@ -58,7 +58,7 @@ PROJECT_LOGO =
# entered, it will be relative to the location where doxygen was started. If
# left blank the current directory will be used.
OUTPUT_DIRECTORY = docBin
OUTPUT_DIRECTORY = .
# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
# directories (in 2 levels) under the output directory of each output format and
......@@ -778,7 +778,9 @@ WARN_LOGFILE =
INPUT = ../../include/ck/tensor_operation/gpu/grid \
../../include/ck/tensor_operation/gpu/block \
../../include/ck/tensor_operation/gpu/thread \
../../library/include/ck/library/utility
../../library/include/ck/library/utility \
../../include/ck/wrapper
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
......
============================
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _composable-kernel:
********************************************************************
Composable Kernel User Guide
============================
********************************************************************
The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++. This document contains instructions for installing, using, and contributing to the Composable Kernel project. To learn more see :ref:`what-is-ck`.
------------
Introduction
------------
The CK documentation is structured as follows:
This document contains instructions for installing, using, and contributing to Composable Kernel (CK).
.. card:: Conceptual
-----------
Methodology
-----------
* :ref:`what-is-ck`
The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical
kernels for machine learning workloads across multiple architectures, including GPUs and CPUs,
through general-purpose kernel languages such as HIP C++.
.. card:: Installation
CK utilizes two concepts to achieve performance portability and code maintainability:
* :ref:`docker-hub`
* A tile-based programming model
* Algorithm complexity reduction for complex ML operators, using an innovative technique we call
"Tensor Coordinate Transformation".
.. card:: Tutorial
.. image:: data/ck_component.png
:alt: CK Components
* :ref:`hello-world`
--------------
Code Structure
--------------
.. card:: API reference
The current CK library is structured into four layers:
* :ref:`supported-primitives`
* :ref:`api-reference`
* :ref:`wrapper`
* "Templated Tile Operators" layer
* "Templated Kernel and Invoker" layer
* "Instantiated Kernel and Invoker" layer
* "Client API" layer
.. card:: Contributing to CK
.. image:: data/ck_layer.png
:alt: CK Layers
Documentation Roadmap
^^^^^^^^^^^^^^^^^^^^^
The following is a list of CK documents in the suggested reading order:
* :ref:`contributing-to`
.. toctree::
:maxdepth: 5
:caption: Contents:
:numbered:
To contribute to the documentation refer to `Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/index.html>`_.
tutorial_hello_world
dockerhub
Supported_Primitives_Guide
API_Reference_Guide
Contributors_Guide
You can find licensing information on the `Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page.
```{include} ../LICENSE.md
```
=======
License
=======
.. include:: ../LICENSE
:literal:
# Anywhere {branch} is used, the branch name will be substituted.
# These comments will also be removed.
defaults:
numbered: False
maxdepth: 6
root: index
subtrees:
- caption: About
entries:
- file: license
- entries:
- file: what-is-ck.rst
title: What is Composable Kernel?
- file: dockerhub.rst
title: Docker Hub
- file: tutorial_hello_world.rst
title: Hello World Tutorial
- file: Supported_Primitives_Guide.rst
title: Supported Primitives
- file: API_Reference_Guide.rst
title: API Reference
- file: wrapper.rst
title: Wrapper
- file: Contributors_Guide.rst
title: Contributing to CK
- file: license.md
title: License
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment