Merge branch 'develop' into feature/integrage-karg-simplification-pr

853e797e · Po Yen Chen · GitHub · 4ddee80b · ac9e01e2 · 853e797e
Unverified Commit 853e797e authored May 25, 2023 by Po Yen Chen Committed by GitHub May 25, 2023
20 changed files
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -6,7 +6,7 @@
 version: 2
 updates:
  - package-ecosystem: "pip" # See documentation for possible values
-    directory: "/docs/.sphinx" # Location of package manifests
+    directory: "/docs/sphinx" # Location of package manifests
    open-pull-requests-limit: 10
    schedule:
      interval: "daily"
--- a/.gitignore
+++ b/.gitignore
@@ -49,10 +49,10 @@ build*
 install.dir*
 # documentation artifacts
-build/
 _build/
 _images/
 _static/
 _templates/
 _toc.yml
 docBin/
+_doxygen/
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -11,8 +11,8 @@ build:
 sphinx:
   configuration: docs/conf.py
-formats: [htmlzip]
+formats: [htmlzip, pdf, epub]
 python:
   install:
-   - requirements: docs/.sphinx/requirements.txt
+   - requirements: docs/sphinx/requirements.txt
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -493,10 +493,11 @@ def Build_CK(Map conf=[:]){
                {
                    cmake_build(conf)
                    dir("build"){
+                        //run tests and examples 	
+                        sh 'make -j\$(( \$(nproc) / 2 )) check'
                        if (navi_node == 0 ){
-                           //run tests and examples on all nodes except Navi
+                            //we only need the ckProfiler to run the performance tests, so we pack and stash it
-                           sh 'make -j check'
+                            //do not stash profiler on Navi nodes
-                           //we only need the ckProfiler to run the performance tests, so we pack and stash it
                           sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
                           stash "ckProfiler.tar.gz"
                        }
@@ -686,12 +687,31 @@ pipeline {
        {
            parallel
            {
+                stage("Build CK and run Tests on MI100/MI200/MI300")
+                {
+                    when {
+                        beforeAgent true
+                        expression { params.RUN_FULL_QA.toBoolean() }
+                    }
+                    agent{ label rocmnode("gfx908 || gfx90a") }
+                    environment{
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940" """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                    }
+                    steps{
+                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                    }
+                }
                stage("Build CK and run Tests on MI100/MI200")
                {
+                    when {
+                        beforeAgent true
+                        expression { !params.RUN_FULL_QA.toBoolean() }
+                    }
                    agent{ label rocmnode("gfx908 || gfx90a") }
                    environment{
                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" """
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908,gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ 
                    }
                    steps{
                        Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
@@ -705,8 +725,8 @@ pipeline {
                    }
                    agent{ label rocmnode("navi21") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install """ 
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" """ 
-                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030;gfx1100;gfx1101;gfx1102" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
+                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
                    }
                    steps{

--- a/README.md
+++ b/README.md
 # Composable Kernel
 ## Methodology
 Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
 CK utilizes two concepts to achieve performance portability and code maintainability:
@@ -10,6 +11,7 @@ CK utilizes two concepts to achieve performance portability and code maintainabi
 ![ALT](/docs/data/ck_component.png "CK Components")
 ## Code Structure
 Current CK library are structured into 4 layers:
 * "Templated Tile Operators" layer
 * "Templated Kernel and Invoker" layer
@@ -24,30 +26,35 @@ Run the steps below to build documentation locally.
 ```
 cd docs
-pip3 install -r .sphinx/requirements.txt
+pip3 install -r sphinx/requirements.txt
 python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
 ```
 ## Contributors
 The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
 ## Citation
 If you use CK, please use following citations:
 * CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???)
 * [CITATION.cff](/CITATION.cff)
 ## License
 CK is released under the MIT license. [License File](/LICENSE)
 # Build CK
 ## Build docker image
 ```bash
 DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
 ```
 ## Launch docker
 ```bash
 docker run                                     \
 -it                                            \
@@ -60,10 +67,12 @@ ck:latest                                      \
 ```
 ## Build CK
 ```bash
 mkdir build && cd build
 # Need to specify target ID, example below is for gfx908 and gfx90a
 cmake                                                                                             \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
@@ -74,6 +83,7 @@ cmake
 ```
 ### Build examples and tests
 ```bash
 make -j examples tests
 make test
@@ -83,21 +93,25 @@ Instructions for running each individual examples are under [example](/example)
 ## Build ckProfiler
 ```bash
 make -j ckProfiler
 ```
 Instructions for running ckProfiler are under [profiler](/profiler)
 ## Install CK
 ```bash
 make install
 ```
 ## Using CK as pre-built kernel library
 Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
 ## Caveat
 ### Kernel Timing and Verification
 CK's own kernel timer will warn up kernel once, and then run it multiple times
 to get average kernel time. For some kernels that use atomic add, this will cause
 output buffer to be accumulated multiple times, causing verification failure.

--- a/client_example/18_groupnorm/groupnorm_swish.cpp
+++ b/client_example/18_groupnorm/groupnorm_swish.cpp
@@ -131,11 +131,12 @@ int main(int argc, char* argv[])
        }
    }
-    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
-              << best_op_name << std::endl;
    // run the best intance
+    if(found)
    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

--- a/client_example/19_pool_fwd/CMakeLists.txt
+++ b/client_example/19_pool_fwd/CMakeLists.txt
+add_executable(client_max_pool2d_fwd max_pool2d_fwd.cpp)
+target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_operations)
+add_executable(client_avg_pool3d_fwd avg_pool3d_fwd.cpp)
+target_link_libraries(client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations)
\ No newline at end of file
--- a/client_example/19_pool_fwd/avg_pool3d_fwd.cpp
+++ b/client_example/19_pool_fwd/avg_pool3d_fwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
+using InDataType    = ck::half_t;
+using OutDataType   = ck::half_t;
+using IndexDataType = int32_t;
+constexpr ck::index_t InOutRank  = 5;
+constexpr ck::index_t WindowRank = 3;
+#if 0
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::MAX;
+constexpr bool OutputIndex = false;
+#else
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::AVG;
+constexpr bool OutputIndex = false;
+#endif
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    ck::index_t N               = 2;
+    ck::index_t C               = 32;
+    ck::index_t Z               = 2;
+    ck::index_t Y               = 2;
+    ck::index_t X               = 2;
+    ck::index_t Di              = 30;
+    ck::index_t Hi              = 30;
+    ck::index_t Wi              = 30;
+    ck::index_t window_stride_d = 2;
+    ck::index_t window_stride_h = 2;
+    ck::index_t window_stride_w = 2;
+    ck::index_t in_left_pad_d   = 1;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_d  = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+    ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Z) / window_stride_d + 1;
+    ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
+    ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
+    // Pool API only support the order of NCDHW
+    std::vector<ck::index_t> in_length              = {N, C, Di, Hi, Wi};
+    std::vector<ck::index_t> out_length             = {N, C, Do, Ho, Wo};
+    std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
+    std::vector<ck::index_t> window_strides   = {window_stride_d, window_stride_h, window_stride_w};
+    std::vector<ck::index_t> input_left_pads  = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
+    std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
+    std::size_t in_tensor_size  = N * C * Di * Hi * Wi;
+    std::size_t out_tensor_size = N * C * Do * Ho * Wo;
+    // tensor layout = NDHWC
+    std::vector<ck::index_t> in_tensor_stride  = {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C};
+    std::vector<ck::index_t> out_tensor_stride = {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C};
+    SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
+    SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
+    SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
+    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                 WindowRank,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 IndexDataType,
+                                                                 ReduceOpId,
+                                                                 OutputIndex>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            out_tensor_stride,
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3, 4});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t num_bytes =
+                in_tensor_size * sizeof(InDataType) + out_tensor_size * sizeof(OutDataType);
+            if constexpr(OutputIndex)
+                num_bytes += out_tensor_size * sizeof(IndexDataType);
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    // run the best intance
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            out_tensor_stride,
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3, 4});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/client_example/19_pool_fwd/max_pool2d_fwd.cpp
+++ b/client_example/19_pool_fwd/max_pool2d_fwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp"
+using InDataType    = ck::half_t;
+using OutDataType   = ck::half_t;
+using IndexDataType = int32_t;
+constexpr ck::index_t InOutRank  = 4;
+constexpr ck::index_t WindowRank = 2;
+#if 1
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::MAX;
+constexpr bool OutputIndex = true;
+#else
+constexpr auto ReduceOpId  = ck::ReduceTensorOp::AVG;
+constexpr bool OutputIndex = false;
+#endif
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    ck::index_t N               = 2;
+    ck::index_t C               = 32;
+    ck::index_t Y               = 2;
+    ck::index_t X               = 2;
+    ck::index_t Hi              = 30;
+    ck::index_t Wi              = 30;
+    ck::index_t window_stride_h = 2;
+    ck::index_t window_stride_w = 2;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+    ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
+    ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
+    // Pool API only support the order of NCHW
+    std::vector<ck::index_t> in_length              = {N, C, Hi, Wi};
+    std::vector<ck::index_t> out_length             = {N, C, Ho, Wo};
+    std::vector<ck::index_t> window_spatial_lengths = {Y, X};
+    std::vector<ck::index_t> window_strides         = {window_stride_h, window_stride_w};
+    std::vector<ck::index_t> input_left_pads        = {in_left_pad_h, in_left_pad_w};
+    std::vector<ck::index_t> input_right_pads       = {in_right_pad_h, in_right_pad_w};
+    std::size_t in_tensor_size  = N * C * Hi * Wi;
+    std::size_t out_tensor_size = N * C * Ho * Wo;
+    // tensor layout = NHWC
+    std::vector<ck::index_t> in_tensor_stride  = {C * Hi * Wi, 1, Wi * C, C};
+    std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
+    SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
+    SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
+    SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
+    using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
+                                                                 WindowRank,
+                                                                 InDataType,
+                                                                 OutDataType,
+                                                                 IndexDataType,
+                                                                 ReduceOpId,
+                                                                 OutputIndex>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            out_tensor_stride,
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t num_bytes =
+                in_tensor_size * sizeof(InDataType) + out_tensor_size * sizeof(OutDataType);
+            if constexpr(OutputIndex)
+                num_bytes += out_tensor_size * sizeof(IndexDataType);
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    // run the best intance
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
+            in_length,
+            window_spatial_lengths,
+            out_length,
+            in_tensor_stride,
+            out_tensor_stride,
+            out_tensor_stride,
+            window_strides,
+            input_left_pads,
+            input_right_pads,
+            {2, 3});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/docs/.sphinx/_toc.yml.in
+++ b/docs/.sphinx/_toc.yml.in
-root: index
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -4,10 +4,21 @@
 # list see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
+import subprocess
 from rocm_docs import ROCmDocs
-docs_core = ROCmDocs("Composable Kernel Documentation")
-docs_core.run_doxygen()
+name = "Composable Kernel"
+get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt'
+version = subprocess.getoutput(get_version)
+if len(version) > 0:
+    name = f"{name} {version}"
+external_toc_path = "./sphinx/_toc.yml"
+docs_core = ROCmDocs(f"{name} Documentation")
+docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml")
 docs_core.setup()
 mathjax3_config = {

--- a/docs/.doxygen/Doxyfile
+++ b/docs/.doxygen/Doxyfile
--- a/docs/license.rst
+++ b/docs/license.rst
+=======
+License
+=======
+.. include:: ../LICENSE
+   :literal:
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
+# Anywhere {branch} is used, the branch name will be substituted.
+# These comments will also be removed.
+defaults:
+  numbered: False
+  maxdepth: 6
+root: index
+subtrees:
+  - caption: About
+    entries:
+      - file: license
--- a/docs/.sphinx/requirements.in
+++ b/docs/.sphinx/requirements.in
-rocm-docs-core==0.2.0
+rocm-docs-core==0.10.3
 sphinxcontrib-bibtex==2.5.0
--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
 #
-# This file is autogenerated by pip-compile with Python 3.10
+# This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-#    pip-compile .sphinx/requirements.in
+#    pip-compile requirements.in
 #
 accessible-pygments==0.0.3
    # via pydata-sphinx-theme
 alabaster==0.7.13
    # via sphinx
-asttokens==2.2.1
-    # via stack-data
-attrs==22.2.0
-    # via
-    #   jsonschema
-    #   jupyter-cache
 babel==2.12.1
    # via
    #   pydata-sphinx-theme
    #   sphinx
-backcall==0.2.0
-    # via ipython
 beautifulsoup4==4.11.2
    # via pydata-sphinx-theme
 breathe==4.34.0
@@ -27,19 +19,15 @@ breathe==4.34.0
 certifi==2022.12.7
    # via requests
 cffi==1.15.1
-    # via pynacl
+    # via
+    #   cryptography
+    #   pynacl
 charset-normalizer==3.1.0
    # via requests
 click==8.1.3
-    # via
+    # via sphinx-external-toc
-    #   jupyter-cache
+cryptography==40.0.2
-    #   sphinx-external-toc
+    # via pyjwt
-comm==0.1.2
-    # via ipykernel
-debugpy==1.6.6
-    # via ipykernel
-decorator==5.1.1
-    # via ipython
 deprecated==1.2.13
    # via pygithub
 docutils==0.16
@@ -48,52 +36,26 @@ docutils==0.16
    #   myst-parser
    #   pybtex-docutils
    #   pydata-sphinx-theme
-    #   rocm-docs-core
    #   sphinx
    #   sphinxcontrib-bibtex
-executing==1.2.0
-    # via stack-data
-fastjsonschema==2.16.3
-    # via nbformat
 gitdb==4.0.10
    # via gitpython
 gitpython==3.1.31
    # via rocm-docs-core
-greenlet==2.0.2
-    # via sqlalchemy
 idna==3.4
    # via requests
 imagesize==1.4.1
    # via sphinx
 importlib-metadata==6.0.0
    # via
-    #   jupyter-cache
+    #   sphinx
-    #   myst-nb
+    #   sphinxcontrib-bibtex
-ipykernel==6.21.3
+importlib-resources==5.12.0
-    # via myst-nb
+    # via rocm-docs-core
-ipython==8.11.0
-    # via
-    #   ipykernel
-    #   myst-nb
-jedi==0.18.2
-    # via ipython
 jinja2==3.1.2
    # via
    #   myst-parser
    #   sphinx
-jsonschema==4.17.3
-    # via nbformat
-jupyter-cache==0.5.0
-    # via myst-nb
-jupyter-client==8.0.3
-    # via
-    #   ipykernel
-    #   nbclient
-jupyter-core==5.3.0
-    # via
-    #   ipykernel
-    #   jupyter-client
-    #   nbformat
 latexcodec==2.0.1
    # via pybtex
 linkify-it-py==1.0.3
@@ -104,54 +66,16 @@ markdown-it-py==2.2.0
    #   myst-parser
 markupsafe==2.1.2
    # via jinja2
-matplotlib-inline==0.1.6
-    # via
-    #   ipykernel
-    #   ipython
 mdit-py-plugins==0.3.5
    # via myst-parser
 mdurl==0.1.2
    # via markdown-it-py
-myst-nb==0.17.1
+myst-parser[linkify]==1.0.0
    # via rocm-docs-core
-myst-parser[linkify]==0.18.1
-    # via
-    #   myst-nb
-    #   rocm-docs-core
-nbclient==0.5.13
-    # via
-    #   jupyter-cache
-    #   myst-nb
-nbformat==5.7.3
-    # via
-    #   jupyter-cache
-    #   myst-nb
-    #   nbclient
-nest-asyncio==1.5.6
-    # via
-    #   ipykernel
-    #   nbclient
 packaging==23.0
    # via
-    #   ipykernel
    #   pydata-sphinx-theme
    #   sphinx
-parso==0.8.3
-    # via jedi
-pexpect==4.8.0
-    # via ipython
-pickleshare==0.7.5
-    # via ipython
-platformdirs==3.1.1
-    # via jupyter-core
-prompt-toolkit==3.0.38
-    # via ipython
-psutil==5.9.4
-    # via ipykernel
-ptyprocess==0.7.0
-    # via pexpect
-pure-eval==0.2.2
-    # via stack-data
 pybtex==0.24.0
    # via
    #   pybtex-docutils
@@ -160,57 +84,47 @@ pybtex-docutils==1.0.2
    # via sphinxcontrib-bibtex
 pycparser==2.21
    # via cffi
-pydata-sphinx-theme==0.13.1
+pydata-sphinx-theme==0.13.3
-    # via sphinx-book-theme
+    # via
-pygithub==1.57
+    #   rocm-docs-core
+    #   sphinx-book-theme
+pygithub==1.58.2
    # via rocm-docs-core
 pygments==2.14.0
    # via
    #   accessible-pygments
-    #   ipython
    #   pydata-sphinx-theme
    #   sphinx
-pyjwt==2.6.0
+pyjwt[crypto]==2.6.0
    # via pygithub
 pynacl==1.5.0
    # via pygithub
-pyrsistent==0.19.3
+pytz==2023.3
-    # via jsonschema
+    # via babel
-python-dateutil==2.8.2
-    # via jupyter-client
 pyyaml==6.0
    # via
-    #   jupyter-cache
-    #   myst-nb
    #   myst-parser
    #   pybtex
    #   sphinx-external-toc
-pyzmq==25.0.1
-    # via
-    #   ipykernel
-    #   jupyter-client
 requests==2.28.2
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==0.2.0
+rocm-docs-core==0.10.3
-    # via -r .sphinx/requirements.in
+    # via -r requirements.in
 six==1.16.0
    # via
-    #   asttokens
    #   latexcodec
    #   pybtex
-    #   python-dateutil
 smmap==5.0.0
    # via gitdb
 snowballstemmer==2.2.0
    # via sphinx
 soupsieve==2.4
    # via beautifulsoup4
-sphinx==4.3.1
+sphinx==5.3.0
    # via
    #   breathe
-    #   myst-nb
    #   myst-parser
    #   pydata-sphinx-theme
    #   rocm-docs-core
@@ -220,7 +134,7 @@ sphinx==4.3.1
    #   sphinx-external-toc
    #   sphinx-notfound-page
    #   sphinxcontrib-bibtex
-sphinx-book-theme==1.0.0rc2
+sphinx-book-theme==1.0.1
    # via rocm-docs-core
 sphinx-copybutton==0.5.1
    # via rocm-docs-core
@@ -233,7 +147,7 @@ sphinx-notfound-page==0.8.3
 sphinxcontrib-applehelp==1.0.4
    # via sphinx
 sphinxcontrib-bibtex==2.5.0
-    # via -r .sphinx/requirements.in
+    # via -r requirements.in
 sphinxcontrib-devhelp==1.0.2
    # via sphinx
 sphinxcontrib-htmlhelp==2.0.1
@@ -244,40 +158,15 @@ sphinxcontrib-qthelp==1.0.3
    # via sphinx
 sphinxcontrib-serializinghtml==1.1.5
    # via sphinx
-sqlalchemy==1.4.46
-    # via jupyter-cache
-stack-data==0.6.2
-    # via ipython
-tabulate==0.9.0
-    # via jupyter-cache
-tornado==6.2
-    # via
-    #   ipykernel
-    #   jupyter-client
-traitlets==5.9.0
-    # via
-    #   comm
-    #   ipykernel
-    #   ipython
-    #   jupyter-client
-    #   jupyter-core
-    #   matplotlib-inline
-    #   nbclient
-    #   nbformat
 typing-extensions==4.5.0
-    # via
+    # via pydata-sphinx-theme
-    #   myst-nb
-    #   myst-parser
 uc-micro-py==1.0.1
    # via linkify-it-py
 urllib3==1.26.15
    # via requests
-wcwidth==0.2.6
-    # via prompt-toolkit
 wrapt==1.15.0
    # via deprecated
 zipp==3.15.0
-    # via importlib-metadata
+    # via
+    #   importlib-metadata
-# The following packages are considered to be unsafe in a requirements file:
+    #   importlib-resources
-# setuptools
--- a/example/02_gemm_bilinear/CMakeLists.txt
+++ b/example/02_gemm_bilinear/CMakeLists.txt
-add_example_executable(example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp)
 if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
    add_example_executable(example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp)
 endif()
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+    add_example_executable(example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp)
+endif()
--- a/example/03_gemm_bias_relu/CMakeLists.txt
+++ b/example/03_gemm_bias_relu/CMakeLists.txt
-add_example_executable(example_gemm_bias_relu_xdl_fp16 gemm_bias_relu_xdl_fp16.cpp)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+    add_example_executable(example_gemm_bias_relu_xdl_fp16 gemm_bias_relu_xdl_fp16.cpp)
+endif()
\ No newline at end of file
--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
-add_custom_target(example_gemm_add_add_fastgelu_xdl)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
+    add_custom_target(example_gemm_add_add_fastgelu_xdl)
-add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
+    add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
-add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
+    add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
-add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
+    add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
-if(USE_BITINT_EXTENSION_INT4)
+    if(USE_BITINT_EXTENSION_INT4)
-  add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp)
+       add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp)
-endif(USE_BITINT_EXTENSION_INT4)
+    endif(USE_BITINT_EXTENSION_INT4)
-add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
+    add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
-add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16)
+    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16)
-add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16)
+    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16)
-add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32)
+    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32)
-if(USE_BITINT_EXTENSION_INT4)
+    if(USE_BITINT_EXTENSION_INT4)
-  add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
+        add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4)
-endif(USE_BITINT_EXTENSION_INT4)
+    endif(USE_BITINT_EXTENSION_INT4)
-add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
+    add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8)
+endif()
\ No newline at end of file
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
-add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
+if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
-add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
+    add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
-add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
+    add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
-add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
+    add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
-# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
+    add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
-add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+    # FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
+    add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+endif()
 add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp)
 add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp)
 add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp)