Merge branch 'develop' into amd-develop

ea5be216 · Jun Liu · e2eb0418 · 25935b57 · ea5be216 · ea5be216
Commit ea5be216 authored Aug 23, 2024 by Jun Liu
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,8 +62,14 @@ if (DTYPES)
    endif()
    message("DTYPES macro set to ${DTYPES}")
 else()
-    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
+    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8)
-    set(CK_ENABLE_ALL_DTYPES "ON")
+    set(CK_ENABLE_INT8 "ON")
+    set(CK_ENABLE_FP16 "ON")
+    set(CK_ENABLE_FP32 "ON")
+    set(CK_ENABLE_FP64 "ON")
+    set(CK_ENABLE_BF16 "ON")
+    set(CK_ENABLE_FP8 "ON")
+    set(CK_ENABLE_BF8 "ON")
 endif()
 #for f8/bf8_t type
@@ -182,12 +188,18 @@ endif()
 configure_file(include/ck/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ck/config.h)
 if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
-   message("Adding the fno-offload-uniform-block compiler flag")
+  check_cxx_compiler_flag("-fno-offload-uniform-block" HAS_NO_OFFLOAD_UNIFORM_BLOCK)
-   add_compile_options(-fno-offload-uniform-block)
+  if(HAS_NO_OFFLOAD_UNIFORM_BLOCK)
+    message("Adding the fno-offload-uniform-block compiler flag")
+    add_compile_options(-fno-offload-uniform-block)
+  endif()
 endif()
 if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
-   message("Adding the enable-post-misched=0 compiler flag")
+  check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
-   add_compile_options("SHELL: -mllvm -enable-post-misched=0")
+  if(HAS_ENABLE_POST_MISCHED)
+    message("Adding the enable-post-misched=0 compiler flag")
+    add_compile_options("SHELL: -mllvm -enable-post-misched=0")
+  endif()
 endif()
 set(check-coerce)
 check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce)
@@ -541,12 +553,7 @@ if(NOT DEFINED INSTANCES_ONLY)
        PACKAGE_NAME examples
   )
   add_subdirectory(example)
-   if(GPU_TARGETS MATCHES "gfx9" AND NOT INSTANCES_ONLY)
+   add_subdirectory(test)
-      add_subdirectory(codegen)
-   endif()
-   if(BUILD_TESTING)
-      add_subdirectory(test)
-   endif()
   rocm_package_setup_component(profiler
        LIBRARY_NAME composablekernel
@@ -563,6 +570,10 @@ if(NOT DEFINED INSTANCES_ONLY)
  endif()
 endif()
+if(NOT DEFINED PROFILER_ONLY AND (GPU_TARGETS MATCHES "gfx9" OR DEFINED INSTANCES_ONLY))
+  add_subdirectory(codegen)
+endif()
 #Create an interface target for the include only files and call it "composablekernels"
 include(CMakePackageConfigHelpers)

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -426,8 +426,9 @@ def runCKProfiler(Map conf=[:]){
                            archiveArtifacts "perf_resnet50_N4.log"
                            archiveArtifacts "perf_batched_gemm.log"
                            archiveArtifacts "perf_grouped_gemm.log"
-                            archiveArtifacts "perf_conv_fwd.log"
+                            archiveArtifacts "perf_grouped_conv_fwd.log"
-                            archiveArtifacts "perf_conv_bwd_data.log"
+                            archiveArtifacts "perf_grouped_conv_bwd_data.log"
+                            archiveArtifacts "perf_grouped_conv_bwd_weight.log"
                            archiveArtifacts "perf_gemm_bilinear.log"
                            archiveArtifacts "perf_reduction.log"
                            archiveArtifacts "perf_splitK_gemm.log"
@@ -439,8 +440,9 @@ def runCKProfiler(Map conf=[:]){
                            stash name: "perf_resnet50_N4.log"
                            stash name: "perf_batched_gemm.log"
                            stash name: "perf_grouped_gemm.log"
-                            stash name: "perf_conv_fwd.log"
+                            stash name: "perf_grouped_conv_fwd.log"
-                            stash name: "perf_conv_bwd_data.log"
+                            stash name: "perf_grouped_conv_bwd_data.log"
+                            stash name: "perf_grouped_conv_bwd_weight.log"
                            stash name: "perf_gemm_bilinear.log"
                            stash name: "perf_reduction.log"
                            stash name: "perf_splitK_gemm.log"
@@ -648,8 +650,9 @@ def process_results(Map conf=[:]){
                        unstash "perf_resnet50_N4.log"
                        unstash "perf_batched_gemm.log"
                        unstash "perf_grouped_gemm.log"
-                        unstash "perf_conv_fwd.log"
+                        unstash "perf_grouped_conv_fwd.log"
-                        unstash "perf_conv_bwd_data.log"
+                        unstash "perf_grouped_conv_bwd_data.log"
+                        unstash "perf_grouped_conv_bwd_weight.log"
                        unstash "perf_gemm_bilinear.log"
                        unstash "perf_reduction.log"
                        unstash "perf_splitK_gemm.log"
@@ -746,6 +749,10 @@ pipeline {
            name: "RUN_PERFORMANCE_TESTS",
            defaultValue: true,
            description: "Run the performance tests (default: ON)")
+        booleanParam(
+            name: "RUN_GROUPED_CONV_LARGE_CASES_TESTS",
+            defaultValue: false,
+            description: "Run the grouped conv large cases tests (default: OFF)")
        booleanParam(
            name: "RUN_CK_TILE_TESTS",
            defaultValue: false,
@@ -837,6 +844,30 @@ pipeline {
                }
            }
        }
+        stage("Run Grouped Conv Large Case Tests") // opt-in stage; its only branch is gated by the RUN_GROUPED_CONV_LARGE_CASES_TESTS job parameter
+        {
+            parallel // only one branch (gfx90a) is defined in this block
+            {
+                stage("Run Grouped Conv Large Case Tests on gfx90a")
+                {
+                    when {
+                        beforeAgent true // evaluate the condition before an agent is allocated
+                        expression { params.RUN_GROUPED_CONV_LARGE_CASES_TESTS.toBoolean() } // run only when the boolean job parameter is set
+                    }
+                    agent{ label rocmnode("gfx90a")} // rocmnode() is defined outside this view
+                    environment{
+                        setup_args = "NO_CK_BUILD" // forwarded as setup_args below; presumably makes the helper skip its own CK build - TODO confirm
+                        execute_args = """ ../script/cmake-ck-dev.sh  ../ gfx90a && \
+                                           make -j64 test_grouped_convnd_fwd_large_cases_xdl && \
+                                           ./bin/test_grouped_convnd_fwd_large_cases_xdl"""
+                   } // execute_args: run cmake-ck-dev.sh for gfx90a, build only the large-cases test target, then run that binary
+                    steps{
+                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) // helper defined outside this view
+                        cleanWs() // clean the workspace once the helper returns
+                    }
+                }
+            }
+        }
        stage("Run CK_TILE Tests")
        {
            parallel

--- a/client_example/07_grouped_convnd_fwd/CMakeLists.txt
+++ b/client_example/07_grouped_convnd_fwd/CMakeLists.txt
@@ -5,17 +5,17 @@ if(GPU_TARGETS MATCHES "gfx9")
    add_executable(client_grouped_conv1d_fwd grouped_conv1d_fwd.cpp)
    target_link_libraries(client_grouped_conv1d_fwd PRIVATE composable_kernel::device_conv_operations)
-    if((DTYPES MATCHES "fp8") OR NOT DEFINED DTYPES)
+    if((DTYPES MATCHES "fp8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
        add_executable(client_grouped_conv3d_fwd_fp8 grouped_conv3d_fwd_fp8.cpp)
        target_link_libraries(client_grouped_conv3d_fwd_fp8 PRIVATE composable_kernel::device_conv_operations)
    endif()
-    if((DTYPES MATCHES "bf8") OR NOT DEFINED DTYPES)
+    if((DTYPES MATCHES "bf8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
        add_executable(client_grouped_conv3d_fwd_bf8 grouped_conv3d_fwd_bf8.cpp)
        target_link_libraries(client_grouped_conv3d_fwd_bf8 PRIVATE composable_kernel::device_conv_operations)
    endif()
-    if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR NOT DEFINED DTYPES)
+    if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
        add_executable(client_grouped_conv3d_fwd_fp8_bf8 grouped_conv3d_fwd_fp8_bf8.cpp)
        target_link_libraries(client_grouped_conv3d_fwd_fp8_bf8 PRIVATE composable_kernel::device_conv_operations)

--- a/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
+++ b/client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
@@ -4,5 +4,7 @@ target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::
 add_executable(client_grouped_conv3d_bwd_data grouped_conv3d_bwd_data.cpp)
 target_link_libraries(client_grouped_conv3d_bwd_data PRIVATE composable_kernel::device_conv_operations)
-add_executable(client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp)
-target_link_libraries(client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 PRIVATE composable_kernel::device_conv_operations)
+# Build the fp16-input / bf8+f8-compute example only when DTYPES requests both
+# fp8 and bf8, or when DTYPES is not defined and GPU_TARGETS matches gfx94.
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
+    add_executable(client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 grouped_conv3d_bwd_data_input_fp16_comp_bf8f8.cpp)
+    target_link_libraries(client_grouped_conv3d_bwd_data_input_fp16_comp_bf8f8 PRIVATE composable_kernel::device_conv_operations)
+endif()
--- a/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
@@ -2,10 +2,13 @@ add_executable(client_grouped_conv1d_bwd_weight_fp16 grouped_conv1d_bwd_weight_f
 add_executable(client_grouped_conv2d_bwd_weight_fp16 grouped_conv2d_bwd_weight_fp16.cpp)
 add_executable(client_grouped_conv3d_bwd_weight_fp16 grouped_conv3d_bwd_weight_fp16.cpp)
 add_executable(client_grouped_conv3d_bwd_weight_fp32 grouped_conv3d_bwd_weight_fp32.cpp)
-add_executable(client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8.cpp)
 target_link_libraries(client_grouped_conv1d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations)
 target_link_libraries(client_grouped_conv2d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations)
 target_link_libraries(client_grouped_conv3d_bwd_weight_fp16 PRIVATE composable_kernel::device_conv_operations)
 target_link_libraries(client_grouped_conv3d_bwd_weight_fp32 PRIVATE composable_kernel::device_conv_operations)
-target_link_libraries(client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 PRIVATE composable_kernel::device_conv_operations)
+# Build the fp16 / bf8+fp8-compute example only when DTYPES requests both fp8
+# and bf8, or when DTYPES is not defined and GPU_TARGETS matches gfx94.
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94"))
+    add_executable(client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8.cpp)
+    target_link_libraries(client_grouped_conv3d_bwd_weight_fp16_comp_bf8_fp8 PRIVATE composable_kernel::device_conv_operations)
+endif()
--- a/client_example/16_convnd_fwd/CMakeLists.txt
+++ b/client_example/16_convnd_fwd/CMakeLists.txt
@@ -4,7 +4,7 @@ if((DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
 endif()
-if((DTYPES MATCHES "fp8") OR NOT DEFINED DTYPES)
+if((DTYPES MATCHES "fp8") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94")) # fp8 example: built when DTYPES requests fp8, or when DTYPES is not defined and GPU_TARGETS matches gfx94
    add_executable(client_conv3d_fwd_fp16_comp_fp8 conv3d_fwd_fp16_comp_fp8.cpp)
    target_link_libraries(client_conv3d_fwd_fp16_comp_fp8 PRIVATE composable_kernel::device_conv_operations)
 endif()

--- a/client_example/20_splitk_gemm/CMakeLists.txt
+++ b/client_example/20_splitk_gemm/CMakeLists.txt
-if(GPU_TARGETS MATCHES "gfx9" AND ((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES))
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR (NOT DEFINED DTYPES AND GPU_TARGETS MATCHES "gfx94")) # built when DTYPES requests fp8 and fp16, or by default on gfx94; NOTE(review): the explicit-DTYPES path is no longer restricted to gfx9 targets - confirm this is intended
  add_executable(client_splitK_gemm splitK_gemm_fp16_f8.cpp)
  target_link_libraries(client_splitK_gemm PRIVATE composable_kernel::device_gemm_operations)
 endif()
--- a/client_example/24_grouped_conv_activation/CMakeLists.txt
+++ b/client_example/24_grouped_conv_activation/CMakeLists.txt
 if(GPU_TARGETS MATCHES "gfx9")
 # Fwd scaleadd scaleadd relu
-add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 
+add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
               grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp)
 target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations)
@@ -36,7 +36,7 @@ add_executable(client_grouped_convnd_fwd_bilinear_residual_fp16
               grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp)
 target_link_libraries(client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
 # Fwd convinvscale
-add_executable(client_conv3d_fwd_convinvscale_fp8 
+add_executable(client_conv3d_fwd_convinvscale_fp8
               grouped_convnd_fwd_convinvscale/conv3d_fwd_convinvscale_fp8.cpp)
 target_link_libraries(client_conv3d_fwd_convinvscale_fp8 PRIVATE composable_kernel::device_conv_operations)
 # Fwd convscale + Bias
@@ -47,6 +47,22 @@ target_link_libraries(client_conv3d_fwd_convscale_add_fp8 PRIVATE composable_ker
 add_executable(client_conv3d_fwd_convscale_relu_fp8
               grouped_convnd_fwd_convscale_relu/conv3d_fwd_convscale_relu_fp8.cpp)
 target_link_libraries(client_conv3d_fwd_convscale_relu_fp8 PRIVATE composable_kernel::device_conv_operations)
+# Fwd convscale + ReLU + AMAX
+add_executable(client_conv3d_fwd_convscale_relu_amax_fp8
+               grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp)
+target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
+                      PRIVATE composable_kernel::device_conv_operations
+                              composable_kernel::device_other_operations
+                              composable_kernel::device_reduction_operations
+                              utility)
+# Fwd convscale + AMAX
+add_executable(client_conv3d_fwd_convscale_amax_fp8
+               grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp)
+target_link_libraries(client_conv3d_fwd_convscale_amax_fp8
+                      PRIVATE composable_kernel::device_conv_operations
+                              composable_kernel::device_other_operations
+                              composable_kernel::device_reduction_operations
+                              utility)
 # Fwd convscale
 add_executable(client_conv3d_fwd_convscale_fp8
               grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp)
@@ -56,11 +72,11 @@ add_executable(client_conv3d_fwd_convscale_bf8
               grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8.cpp)
 target_link_libraries(client_conv3d_fwd_convscale_bf8 PRIVATE composable_kernel::device_conv_operations)
-add_executable(client_conv3d_fwd_convscale_fp8_bf8 
+add_executable(client_conv3d_fwd_convscale_fp8_bf8
               grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8_bf8.cpp)
 target_link_libraries(client_conv3d_fwd_convscale_fp8_bf8 PRIVATE composable_kernel::device_conv_operations)
-add_executable(client_conv3d_fwd_convscale_bf8_fp8 
+add_executable(client_conv3d_fwd_convscale_bf8_fp8
               grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8_fp8.cpp)
 target_link_libraries(client_conv3d_fwd_convscale_bf8_fp8 PRIVATE composable_kernel::device_conv_operations)
 # Bwd data bilinear

--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp" // presumably declares run_grouped_conv_fwd_convscale_reduce and ConvScale; neither is defined in this file - TODO confirm
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+using InDataType       = ck::f8_t;
+using WeiDataType      = ck::f8_t;
+using CShuffleDataType = float; // NOTE(review): not referenced anywhere else in this file - confirm it is still needed
+using ConvOutDataType  = float;    // data type of convolution result
+using OutDataType      = ck::f8_t; // data type of final result
+using AComputeDataType = ck::f8_t;
+using BComputeDataType = ck::f8_t;
+using ConvElementOp = ConvScale; // element-wise operation type handed to the helper; declared outside this file
+using InLayout  = ck::tensor_layout::convolution::NDHWGC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::NDHWGK;
+constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX; // reduction selected for this example
+static constexpr ck::index_t NumDimSpatial = 3; // three spatial dimensions (D, H, W)
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 64;
+static constexpr ck::index_t K             = 128;
+static constexpr ck::index_t C             = 64;
+static constexpr ck::index_t Z             = 3;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28;
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28; // Do/Ho/Wo are set to the same values as Di/Hi/Wi
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 3;
+int main() // runs the helper once with the fixed problem sizes above; takes no command-line arguments
+{
+    return run_grouped_conv_fwd_convscale_reduce<NumDimSpatial,
+                                                 InDataType,
+                                                 WeiDataType,
+                                                 ConvOutDataType,
+                                                 OutDataType,
+                                                 ConvElementOp,
+                                                 ReduceOpId,
+                                                 InLayout,
+                                                 WeiLayout,
+                                                 OutLayout,
+                                                 3, // NOTE(review): meaning of this literal is fixed by the helper's template parameter list, which is not visible here - confirm
+                                                 AComputeDataType,
+                                                 BComputeDataType>(
+               {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K}) // lengths listed in NDHWGC, GKZYXC and NDHWGK order, matching the layout aliases
+               ? EXIT_SUCCESS // a truthy helper result becomes a zero exit status
+               : EXIT_FAILURE;
+}
--- a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp
+++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp" // presumably declares run_grouped_conv_fwd_convscale_reduce and ConvScaleRelu; neither is defined in this file - TODO confirm
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+using InDataType       = ck::f8_t;
+using WeiDataType      = ck::f8_t;
+using CShuffleDataType = float; // NOTE(review): not referenced anywhere else in this file - confirm it is still needed
+using ConvOutDataType  = float;    // data type of convolution result
+using OutDataType      = ck::f8_t; // data type of final result
+using AComputeDataType = ck::f8_t;
+using BComputeDataType = ck::f8_t;
+using ConvElementOp = ConvScaleRelu; // element-wise operation type handed to the helper; declared outside this file
+using InLayout  = ck::tensor_layout::convolution::NDHWGC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::NDHWGK;
+constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX; // reduction selected for this example
+static constexpr ck::index_t NumDimSpatial = 3; // three spatial dimensions (D, H, W)
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 64;
+static constexpr ck::index_t K             = 128;
+static constexpr ck::index_t C             = 64;
+static constexpr ck::index_t Z             = 3;
+static constexpr ck::index_t Y             = 3;
+static constexpr ck::index_t X             = 3;
+static constexpr ck::index_t Di            = 28;
+static constexpr ck::index_t Hi            = 28;
+static constexpr ck::index_t Wi            = 3;
+static constexpr ck::index_t Do            = 28; // Do/Ho/Wo are set to the same values as Di/Hi/Wi
+static constexpr ck::index_t Ho            = 28;
+static constexpr ck::index_t Wo            = 3;
+int main() // runs the helper once with the fixed problem sizes above; takes no command-line arguments
+{
+    return run_grouped_conv_fwd_convscale_reduce<NumDimSpatial,
+                                                 InDataType,
+                                                 WeiDataType,
+                                                 ConvOutDataType,
+                                                 OutDataType,
+                                                 ConvElementOp,
+                                                 ReduceOpId,
+                                                 InLayout,
+                                                 WeiLayout,
+                                                 OutLayout,
+                                                 3, // NOTE(review): meaning of this literal is fixed by the helper's template parameter list, which is not visible here - confirm
+                                                 AComputeDataType,
+                                                 BComputeDataType>(
+               {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K}) // lengths listed in NDHWGC, GKZYXC and NDHWGK order, matching the layout aliases
+               ? EXIT_SUCCESS // a truthy helper result becomes a zero exit status
+               : EXIT_FAILURE;
+}
--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -34,8 +34,17 @@ if (DTYPES)
    endif()
    message("DTYPES macro set to ${DTYPES}")
 else()
-    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
+    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
-    set(CK_ENABLE_ALL_DTYPES "ON")
+    set(CK_ENABLE_INT8 "ON")
+    set(CK_ENABLE_FP16 "ON")
+    set(CK_ENABLE_FP32 "ON")
+    set(CK_ENABLE_FP64 "ON")
+    set(CK_ENABLE_BF16 "ON")
+    if (GPU_TARGETS MATCHES "gfx94")
+        add_definitions(-DCK_ENABLE_FP8 -DCK_ENABLE_BF8)
+        set(CK_ENABLE_FP8 "ON")
+        set(CK_ENABLE_BF8 "ON")
+    endif()
 endif()
 if (GPU_TARGETS)

--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -27,6 +27,8 @@ file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
 add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
 file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
+##message(STATUS "SOURCE_FILES: ${SOURCES}")
 # TODO: Use object library
 add_library(ck_host STATIC ${SOURCES})
 target_link_libraries(ck_host PRIVATE ck_headers)
@@ -48,6 +50,4 @@ rocm_install(
 )
 rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
-if(BUILD_TESTING)
 add_subdirectory(test)
-endif()
--- a/codegen/test/CMakeLists.txt
+++ b/codegen/test/CMakeLists.txt
 list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
 add_subdirectory(rtc)
 file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
-foreach(TEST_SRC ${TEST_SRCS})
-  set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
-  get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
-  add_executable(test_host_${BASE_NAME} ${TEST_SRC})
-  add_dependencies(codegen test_host_${BASE_NAME})
-  add_test(NAME codegen_test_${BASE_NAME} COMMAND test_host_${BASE_NAME})
-  target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host)
-  # target_link_libraries(test_host_${BASE_NAME} ${CK_ROOT}/build/lib/libutility.a)
-  target_include_directories(test_host_${BASE_NAME} PUBLIC include())
-  target_include_directories(test_host_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
-  target_include_directories(test_host_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
-endforeach()
+# For every *.cpp globbed into TEST_SRCS, build a HIP executable named
+# codegen_test_<basename>, register it with CTest under the same name, and
+# attach it to the `codegen` target. Skipped when INSTANCES_ONLY is true.
+if(NOT INSTANCES_ONLY)
+  foreach(TEST_SRC ${TEST_SRCS})
+    set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
+    get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
+    add_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
+    add_dependencies(codegen codegen_test_${BASE_NAME})
+    # `tests` and `check` are created outside this directory. add_dependencies()
+    # is a hard error when its first argument is not an existing target, so
+    # only hook into the aggregate targets that are actually present.
+    foreach(AGGREGATE_TARGET tests check)
+      if(TARGET ${AGGREGATE_TARGET})
+        add_dependencies(${AGGREGATE_TARGET} codegen_test_${BASE_NAME})
+      endif()
+    endforeach()
+    add_test(NAME codegen_test_${BASE_NAME} COMMAND codegen_test_${BASE_NAME})
+    message("adding test codegen_test_${BASE_NAME}")
+    # Executables have no consumers, so link and include requirements are PRIVATE.
+    target_link_libraries(codegen_test_${BASE_NAME} PRIVATE ck_rtc ck_host)
+    target_include_directories(codegen_test_${BASE_NAME} PRIVATE
+      ${CK_ROOT}/codegen/test/include
+      ${CK_ROOT}/include
+      ${CK_ROOT}/library/include)
+  endforeach()
+endif()
--- a/codegen/test/rtc/CMakeLists.txt
+++ b/codegen/test/rtc/CMakeLists.txt
-find_package(hip)
 file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
 add_library(ck_rtc ${RTC_SOURCES})
 target_include_directories(ck_rtc PUBLIC include)

--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.6.2
+rocm-docs-core==1.7.2
 sphinxcontrib-bibtex==2.6.2
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.6.2
+rocm-docs-core==1.7.2
    # via -r requirements.in
 six==1.16.0
    # via pybtex

--- a/example/01_gemm/gemm_xdl_fp8.cpp
+++ b/example/01_gemm/gemm_xdl_fp8.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -7,7 +7,7 @@
 using ADataType        = ck::f8_t;
 using BDataType        = ck::f8_t;
-using CDataType        = ck::half_t;
+using CDataType        = ck::f8_t;
 using AccDataType      = float;
 using CShuffleDataType = float;

--- a/example/01_gemm/run_gemm_example.inc
+++ b/example/01_gemm/run_gemm_example.inc
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -34,11 +34,11 @@ inline __host__ __device__ constexpr double get_rtol()
    }
    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
    {
-        return 1e-1; // 240 and 224 are acceptable
+        return 2e-1;
    }
    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
    {
-        return 1.5e-1; // 57344 and 49152 are acceptable
+        return 2e-1;
    }
    else
    {
@@ -75,11 +75,11 @@ inline __host__ __device__ constexpr double get_atol()
    }
    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
    {
-        return 16.1; // 240 and 224 are acceptable
+        return 2e-1;
    }
    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
    {
-        return 8192.1; // 57344 and 49152 are acceptable
+        return 2e-1;
    }
    else
    {

--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <initializer_list>
@@ -255,34 +255,61 @@ int main(int argc, char* argv[])
    else
    {
        // for testing half_t
+        pass =
+            pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass =
            pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                        true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing float
+        pass =
+            pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
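         // for testing double
         // NOTE(review): both calls below instantiate <float, float>, the same as the float case above; confirm whether <double, double> was intended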
+        pass =
+            pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing bhalf_t
+        pass = pass &&
+               reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+                   true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass &&
               reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
                   true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing int8_t
+        pass =
+            pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass =
            pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
                        true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
        // for testing int4_t using AVG operation
+        pass =
+            pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
        // for testing int4_t using MAX operation
+        pass =
+            pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
+                        true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
        pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
                           true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 #endif