Merge branch 'transpose_opt' of https://github.com/ROCm/composable_kernel into rowwise_opt

4885c38a · aska-0096 · cbf14ee1 · 7c8e92fa · 4885c38a · 4885c38a
Commit 4885c38a authored Sep 03, 2024 by aska-0096
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -553,12 +553,7 @@ if(NOT DEFINED INSTANCES_ONLY)
        PACKAGE_NAME examples
   )
   add_subdirectory(example)
-   if(GPU_TARGETS MATCHES "gfx9" AND NOT INSTANCES_ONLY)
-      add_subdirectory(codegen)
-   endif()
-   if(BUILD_TESTING)
   add_subdirectory(test)
-   endif()

   rocm_package_setup_component(profiler
        LIBRARY_NAME composablekernel
@@ -575,6 +570,10 @@ if(NOT DEFINED INSTANCES_ONLY)
  endif()
 endif()

+if(NOT DEFINED PROFILER_ONLY AND (GPU_TARGETS MATCHES "gfx9" OR DEFINED INSTANCES_ONLY))
+  add_subdirectory(codegen)
+endif()
+
 #Create an interface target for the include only files and call it "composablekernels"
 include(CMakePackageConfigHelpers)


--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -262,10 +262,19 @@ def cmake_build(Map conf=[:]){
    // reduce parallelism when compiling, clang uses too much memory
    def nt = nthreads()
    def cmd
+    def setup_cmd
+    def build_cmd
    def execute_cmd = conf.get("execute_cmd", "")
    if(!setup_args.contains("NO_CK_BUILD")){
-        def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args}   .. ")
-        def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make  -j${nt} ${config_targets}")
+        if (setup_args.contains("gfx90a") && params.NINJA_BUILD_TRACE){
+            echo "running ninja build trace"
+            setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake -G Ninja ${setup_args}   .. ")
+            build_cmd = conf.get("build_cmd", "${build_envs} ninja -j${nt} ${config_targets}")
+        }
+        else{
+            setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args}   .. ")
+            build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j${nt} ${config_targets}")
+        }
        cmd = conf.get("cmd", """
            ${setup_cmd}
            ${build_cmd}
@@ -281,7 +290,19 @@ def cmake_build(Map conf=[:]){
    echo cmd

    dir("build"){
+        //build CK
        sh cmd
+        //run tests
+        if(!setup_args.contains("NO_CK_BUILD")){
+            if (setup_args.contains("gfx90a") && params.NINJA_BUILD_TRACE){
+                sh "/ninjatracing/ninjatracing .ninja_log > ck_build_trace.json"
+                archiveArtifacts "ck_build_trace.json"
+                sh "ninja test"
+            }
+            else{
+                sh "make check"
+            }
+        }
    }

    // Only archive from master or develop
@@ -543,7 +564,7 @@ def Build_CK(Map conf=[:]){
                    cmake_build(conf)
                    dir("build"){
                        //run tests and examples
-                        sh 'make -j check'
+                        //sh 'make -j check'
                        if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){
                            //we only need the ckProfiler to run the performance tests, so we pack and stash it
                            //do not stash profiler on nodes where we don't need to run performance tests
@@ -684,8 +705,8 @@ def process_results(Map conf=[:]){
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2; RUN_CK_TILE_TESTS=true
                                              0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
-                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false
-                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false
+                                              0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
+                                              0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
                                              0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_CODEGEN_TESTS=false;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false''' : ""

 pipeline {
@@ -765,7 +786,10 @@ pipeline {
            name: "BUILD_GFX12",
            defaultValue: false,
            description: "Build CK and run tests on gfx12 (default: OFF)")
-
+        booleanParam(
+            name: "NINJA_BUILD_TRACE",
+            defaultValue: false,
+            description: "Generate a ninja build trace (default: OFF)")
    }
    environment{
        dbuser = "${dbuser}"
@@ -799,6 +823,7 @@ pipeline {
                    }
                    agent{ label rocmnode("nogpu") }
                    environment{
+                        setup_args = "NO_CK_BUILD"
                        execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
                                -o -not -path \'*.git*\' -iname \'*.hpp\' \
                                -o -not -path \'*.git*\' -iname \'*.cpp\' \
@@ -815,7 +840,7 @@ pipeline {
                                --file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log"
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
+                        buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
                        archiveArtifacts "build/ck_cppcheck.log"
                        cleanWs()
                    }
@@ -827,6 +852,7 @@ pipeline {
                    }
                    agent{ label rocmnode("nogpu") }
                    environment{
+                        setup_args = "NO_CK_BUILD"
                        execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
                                -o -not -path \'*.git*\' -iname \'*.hpp\' \
                                -o -not -path \'*.git*\' -iname \'*.cpp\' \
@@ -838,7 +864,7 @@ pipeline {
                                | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\'"
                    }
                    steps{
-                        buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
+                        buildHipClangJobAndReboot(setup_args:setup_args, setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
                        cleanWs()
                    }
                }
@@ -967,10 +993,10 @@ pipeline {
                    }
                    agent{ label rocmnode("gfx90a") }
                    environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1100;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
                        execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                           cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                                           -DGPU_TARGETS="gfx1100;gfx90a" \
+                                           -DGPU_TARGETS="gfx90a" \
                                           -DCMAKE_CXX_COMPILER="${build_compiler()}" \
                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                    }
@@ -1074,7 +1100,7 @@ pipeline {
                    options { retry(1) }
                    agent{ label rocmnode("gfx90a")}
                    environment{
-                        setup_args = """ -DGPU_TARGETS="gfx90a" -DBUILD_DEV=On """
+                        setup_args = "NO_CK_BUILD"
                    }
                    steps{
                        runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')

--- a/client_example/24_grouped_conv_activation/CMakeLists.txt
+++ b/client_example/24_grouped_conv_activation/CMakeLists.txt
@@ -47,6 +47,22 @@ target_link_libraries(client_conv3d_fwd_convscale_add_fp8 PRIVATE composable_ker
 add_executable(client_conv3d_fwd_convscale_relu_fp8
               grouped_convnd_fwd_convscale_relu/conv3d_fwd_convscale_relu_fp8.cpp)
 target_link_libraries(client_conv3d_fwd_convscale_relu_fp8 PRIVATE composable_kernel::device_conv_operations)
+# Fwd convscale + ReLU + AMAX
+add_executable(client_conv3d_fwd_convscale_relu_amax_fp8
+               grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp)
+target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
+                      PRIVATE composable_kernel::device_conv_operations
+                              composable_kernel::device_other_operations
+                              composable_kernel::device_reduction_operations
+                              utility)
+# Fwd convscale + AMAX
+add_executable(client_conv3d_fwd_convscale_amax_fp8
+               grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp)
+target_link_libraries(client_conv3d_fwd_convscale_amax_fp8
+                      PRIVATE composable_kernel::device_conv_operations
+                              composable_kernel::device_other_operations
+                              composable_kernel::device_reduction_operations
+                              utility)
 # Fwd convscale
 add_executable(client_conv3d_fwd_convscale_fp8
               grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp)

--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType       = ck::f8_t;
+using WeiDataType      = ck::f8_t;
+using CShuffleDataType = float;
+using ConvOutDataType  = float;    // data type of convolution result
+using OutDataType      = ck::f8_t; // data type of final result
+using AComputeDataType = ck::f8_t;
+using BComputeDataType = ck::f8_t;
+
+using ConvElementOp = ConvScale;
+
+using InLayout  = ck::tensor_layout::convolution::NDHWGC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::NDHWGK;
+
+constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX;
+
+static constexpr ck::index_t NumDimSpatial = 3;
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 64;
+static constexpr ck::index_t K             = 128;
+static constexpr ck::index_t C             = 64;
+static constexpr ck::index_t Z             = 3;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28;
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28;
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 3;
+
+int main()
+{
+    return run_grouped_conv_fwd_convscale_reduce<NumDimSpatial,
+                                                 InDataType,
+                                                 WeiDataType,
+                                                 ConvOutDataType,
+                                                 OutDataType,
+                                                 ConvElementOp,
+                                                 ReduceOpId,
+                                                 InLayout,
+                                                 WeiLayout,
+                                                 OutLayout,
+                                                 3,
+                                                 AComputeDataType,
+                                                 BComputeDataType>(
+               {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
+               ? EXIT_SUCCESS
+               : EXIT_FAILURE;
+}
--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType       = ck::f8_t;
+using WeiDataType      = ck::f8_t;
+using CShuffleDataType = float;
+using ConvOutDataType  = float;    // data type of convolution result
+using OutDataType      = ck::f8_t; // data type of final result
+using AComputeDataType = ck::f8_t;
+using BComputeDataType = ck::f8_t;
+
+using ConvElementOp = ConvScaleRelu;
+
+using InLayout  = ck::tensor_layout::convolution::NDHWGC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::NDHWGK;
+
+constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX;
+
+static constexpr ck::index_t NumDimSpatial = 3;
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 64;
+static constexpr ck::index_t K             = 128;
+static constexpr ck::index_t C             = 64;
+static constexpr ck::index_t Z             = 3;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28;
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28;
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 3;
+
+int main()
+{
+    return run_grouped_conv_fwd_convscale_reduce<NumDimSpatial,
+                                                 InDataType,
+                                                 WeiDataType,
+                                                 ConvOutDataType,
+                                                 OutDataType,
+                                                 ConvElementOp,
+                                                 ReduceOpId,
+                                                 InLayout,
+                                                 WeiLayout,
+                                                 OutLayout,
+                                                 3,
+                                                 AComputeDataType,
+                                                 BComputeDataType>(
+               {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
+               ? EXIT_SUCCESS
+               : EXIT_FAILURE;
+}
--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -27,6 +27,8 @@ file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
 add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)

 file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
+
+##message(STATUS "SOURCE_FILES: ${SOURCES}")
 # TODO: Use object library
 add_library(ck_host STATIC ${SOURCES})
 target_link_libraries(ck_host PRIVATE ck_headers)
@@ -48,6 +50,4 @@ rocm_install(
 )
 rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

-if(BUILD_TESTING)
 add_subdirectory(test)
-endif()
--- a/codegen/test/CMakeLists.txt
+++ b/codegen/test/CMakeLists.txt
 list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
 add_subdirectory(rtc)
 file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
-foreach(TEST_SRC ${TEST_SRCS})
+if(NOT INSTANCES_ONLY)
+  foreach(TEST_SRC ${TEST_SRCS})
    set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
    get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
-  add_executable(test_host_${BASE_NAME} ${TEST_SRC})
-  add_dependencies(codegen test_host_${BASE_NAME})
-  add_test(NAME codegen_test_${BASE_NAME} COMMAND test_host_${BASE_NAME})
-  target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host)
-  # target_link_libraries(test_host_${BASE_NAME} ${CK_ROOT}/build/lib/libutility.a)
-  target_include_directories(test_host_${BASE_NAME} PUBLIC include())
-  target_include_directories(test_host_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
-  target_include_directories(test_host_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
-endforeach()
+    add_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
+    add_dependencies(codegen codegen_test_${BASE_NAME})
+    add_dependencies(tests codegen_test_${BASE_NAME})
+    add_dependencies(check codegen_test_${BASE_NAME})
+    add_test(NAME codegen_test_${BASE_NAME} COMMAND codegen_test_${BASE_NAME})
+    message("adding test codegen_test_${BASE_NAME}")
+    target_link_libraries(codegen_test_${BASE_NAME} ck_rtc ck_host)
+    target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/codegen/test/include)
+    target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
+    target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
+  endforeach()
+endif()
--- a/codegen/test/rtc/CMakeLists.txt
+++ b/codegen/test/rtc/CMakeLists.txt
-
-find_package(hip)
 file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
 add_library(ck_rtc ${RTC_SOURCES})
 target_include_directories(ck_rtc PUBLIC include)

--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.7.0
+rocm-docs-core==1.7.2
 sphinxcontrib-bibtex==2.6.2
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.7.0
+rocm-docs-core==1.7.2
    # via -r requirements.in
 six==1.16.0
    # via pybtex

--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -34,6 +34,7 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8_v3)
 add_example_executable(example_gemm_xdl_bf16_v3 gemm_xdl_bf16_v3.cpp)
 target_compile_options(example_gemm_xdl_bf16_v3 PRIVATE -mllvm -greedy-reverse-local-assignment=1 -save-temps=$PWD -Wno-gnu-line-marker)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_v3)
+target_compile_options(example_gemm_xdl_bf16_v3 PRIVATE -mllvm -greedy-reverse-local-assignment=1 -save-temps=$PWD -Wno-gnu-line-marker)

 add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
 add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)

--- a/example/01_gemm/gemm_xdl_bf16_v3.cpp
+++ b/example/01_gemm/gemm_xdl_bf16_v3.cpp
@@ -12,7 +12,7 @@ using CShuffleDataType = ck::bhalf_t;
 using CDataType        = ck::bhalf_t;

 using ALayout = Row;
-using BLayout = Col;
+using BLayout = Row;
 using CLayout = Row;

 using AElementOp = PassThrough;
@@ -28,15 +28,15 @@ using DeviceGemmV2Instance =
        ADataType,   BDataType,  CDataType,  AccDataType,  CShuffleDataType, 
        PassThrough, PassThrough, PassThrough, GemmDefault, 
        256,
-        256, 256, 
-        32, 8, 8,
-        32,   32,
-        4,    4,
-        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
+        224, 256, 
+        64, 8, 1,
+        16,   16,
+        7,    8,
+        S<8, 32, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
        2, 8, 8, 0,
-        S<4, 64, 1>,  S<1, 0, 2>,  S<1, 0, 2>, 
-        2, 8, 8, 0,
-        1, 1, S<1, 32, 1, 8>, 8,
+        S<8, 32, 1>,  S<0, 2, 1>,  S<0, 2, 1>, 
+        1, 8, 8, 1,
+        1, 2, S<1, 32, 1, 8>, 8,
        ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
 // clang-format on


--- a/example/01_gemm/gemm_xdl_fp8.cpp
+++ b/example/01_gemm/gemm_xdl_fp8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -7,7 +7,7 @@

 using ADataType        = ck::f8_t;
 using BDataType        = ck::f8_t;
-using CDataType        = ck::half_t;
+using CDataType        = ck::f8_t;
 using AccDataType      = float;
 using CShuffleDataType = float;


--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -34,11 +34,11 @@ inline __host__ __device__ constexpr double get_rtol()
    }
    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
    {
-        return 1e-1; // 240 and 224 are acceptable
+        return 2e-1;
    }
    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
    {
-        return 1.5e-1; // 57344 and 49152 are acceptable
+        return 2e-1;
    }
    else
    {
@@ -75,11 +75,11 @@ inline __host__ __device__ constexpr double get_atol()
    }
    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
    {
-        return 16.1; // 240 and 224 are acceptable
+        return 2e-1;
    }
    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
    {
-        return 8192.1; // 57344 and 49152 are acceptable
+        return 2e-1;
    }
    else
    {

--- a/example/62_convnd_activ/CMakeLists.txt
+++ b/example/62_convnd_activ/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(convinvscale)
 add_subdirectory(convscale)
 add_subdirectory(convscale_relu)
 add_subdirectory(convscale_add)
+add_subdirectory(convscale_reduce)
 add_subdirectory(multi_AB)
 add_subdirectory(unary)


--- a/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
+     add_custom_target(example_convnd_activ_xdl_convscale_reduce)
+     add_example_executable(example_convnd_fwd_xdl_convscale_relu_amax_fp8 convnd_fwd_xdl_convscale_relu_amax_fp8.cpp)
+     add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_relu_amax_fp8)
+
+     add_example_executable(example_convnd_fwd_xdl_convscale_amax_fp8 convnd_fwd_xdl_convscale_amax_fp8.cpp)
+     add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_amax_fp8)
+
+     set(target 1)
+ endif()
+endforeach()
--- a/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp
+++ b/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp
--- a/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp
+++ b/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_convscale_reduce_common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+using InDataType       = ck::f8_t;
+using WeiDataType      = ck::f8_t;
+using AccDataType      = float;
+using CShuffleDataType = float;
+using ConvOutDataType  = float;    // data type of convolution result
+using OutDataType      = ck::f8_t; // data type of final result
+using AComputeDataType = ck::f8_t;
+using BComputeDataType = ck::f8_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using OutElementOp = ConvScale;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        ConvOutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8,
+        AComputeDataType,
+        BComputeDataType>;
+
+#include "run_convnd_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
--- a/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp
+++ b/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "convnd_fwd_convscale_reduce_common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+
+using InDataType       = ck::f8_t;
+using WeiDataType      = ck::f8_t;
+using AccDataType      = float;
+using CShuffleDataType = float;
+using ConvOutDataType  = float;    // data type of convolution result
+using OutDataType      = ck::f8_t; // data type of final result
+using AComputeDataType = ck::f8_t;
+using BComputeDataType = ck::f8_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using OutElementOp = ConvScaleRelu;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<>,
+        OutLayout,
+        InDataType,
+        WeiDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<>,
+        ConvOutDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        32,          // KPerBlock
+        8,           // AK1
+        8,           // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        8,           // ABlockTransferSrcScalarPerVector
+        8,           // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        8,           // BBlockTransferSrcScalarPerVector
+        8,           // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8,
+        AComputeDataType,
+        BComputeDataType>;
+
+#include "run_convnd_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }