gaoqiong / composable_kernel_ROCM · Commits

Commit 7977f89d
Authored Nov 07, 2024 by carlushuang

    Merge remote-tracking branch 'origin/develop' into ck_tile/moe_quant

Parents: 45131629, 3599418a
Changes: 17
Showing 17 changed files with 90 additions and 77 deletions (+90 -77)
CMakeLists.txt                                                                                             +4  -4
Jenkinsfile                                                                                                +3  -3
example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp                              +3  -2
example/CMakeLists.txt                                                                                     +4  -4
example/ck_tile/01_fmha/generate.py                                                                        +3  -0
example/ck_tile/02_layernorm2d/generate.py                                                                 +1  -1
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp    +6  -6
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp      +12 -12
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp       +6  -6
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp                            +12 -12
include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp                                         +4  -4
library/include/ck/library/utility/check_err.hpp                                                           +4  -4
library/src/tensor_operation_instance/gpu/CMakeLists.txt                                                   +5  -5
library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt                                               +0  -5
profiler/include/profiler/profile_pool3d_fwd_impl.hpp                                                      +16 -2
profiler/src/profile_layernorm_fwd.cpp                                                                     +1  -1
test/CMakeLists.txt                                                                                        +6  -6
CMakeLists.txt

@@ -145,20 +145,20 @@ message("hip_version_flat=${hip_VERSION_FLAT}")
 message("checking which targets are supported")
 #In order to build just the CK library (without tests and examples) for all supported GPU targets
-#use -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+#use -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
 #the GPU_TARGETS flag will be reset in this case in order to avoid conflicts.
 #
 #In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures.
 if(NOT ENABLE_ASAN_PACKAGING)
     if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000)
         # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above
-        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
+        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
    else()
-        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
+        set(CK_GPU_TARGETS "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
    endif()
 else()
     #build CK only for xnack-supported targets when using ASAN
-    set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+")
+    set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+")
 endif()
 #if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list
Jenkinsfile

@@ -1101,11 +1101,11 @@ pipeline {
         agent{ label rocmnode("gfx90a") }
         environment{
             setup_args = """ -DCMAKE_INSTALL_PREFIX=../install \
-                -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
+                -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
                 -DCMAKE_CXX_FLAGS=" -O3 " """
             execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                 cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-                -DGPU_TARGETS="gfx908;gfx90a;gfx940;gfx941;gfx942" \
+                -DGPU_TARGETS="gfx908;gfx90a;gfx942" \
                 -DCMAKE_CXX_COMPILER="${build_compiler()}" \
                 -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
         }
@@ -1165,7 +1165,7 @@ pipeline {
             execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
                 -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                 -D CMAKE_BUILD_TYPE=Release \
-                -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
+                -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
                 -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
         }
         steps{
example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp

@@ -68,7 +68,7 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
 using DeviceReduceInstance =
     ck::tensor_operation::device::DeviceReduceMultiBlock<OutputDataType,
-                                                         OutputDataType,
+                                                         ScaleDataType,
                                                          OutputDataType,
                                                          NumDim,
                                                          NumDim,
@@ -108,7 +108,8 @@ void reference_scale_permute_amax(Tensor<InputDataType>& input,
             host_output_scaled_casted_transposed(m, k) = y1;
             const OutputDataType y_fabs =
                 ck::type_convert<OutputDataType>(ck::math::abs(ck::type_convert<float>(y0)));
-            host_output_amax(0) = ck::math::max(y_fabs, host_output_amax(0));
+            host_output_amax(0) = ck::type_convert<OutputDataType>(ck::math::max(
+                ck::type_convert<float>(y_fabs), ck::type_convert<float>(host_output_amax(0))));
         }
     }
 }
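Note: the reference amax update now round-trips through float: both |y| and the running maximum are converted to float, compared there, and the winner converted back to OutputDataType, so the comparison never runs on the narrow fp8/fp16 output type. A toy, self-contained sketch of the same pattern; the Fp8Like codec below is invented for illustration and is not ck::f8_t:

#include <algorithm>
#include <cmath>
#include <cstdio>

struct Fp8Like // toy 4-fractional-bit stand-in, NOT ck::f8_t
{
    unsigned char bits;
};

float to_float(Fp8Like x) { return x.bits / 16.0f; }                              // toy decode
Fp8Like from_float(float x) { return {static_cast<unsigned char>(x * 16.0f)}; }   // toy encode

int main()
{
    Fp8Like amax = from_float(0.0f);
    const float samples[] = {-3.5f, 1.25f, 2.75f};
    for(float y0 : samples)
    {
        float y_fabs = std::fabs(y0);
        // max in float, then convert back -- mirrors the new reference code.
        amax = from_float(std::max(y_fabs, to_float(amax)));
    }
    std::printf("amax = %g\n", to_float(amax)); // prints 3.5
}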
example/CMakeLists.txt

@@ -85,9 +85,9 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
     #only continue if there are some source files left on the list
     if(FILE_NAME)
         if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(FILE_NAME MATCHES "_wmma")
-            list(REMOVE_ITEM EX_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         endif()
     endif()
     set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
     add_executable(${EXAMPLE_NAME} ${FILE_NAME})
@@ -169,9 +169,9 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
     #only continue if there are some source files left on the list
     if(FILE_NAME)
         if(FILE_NAME MATCHES "_xdl")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(FILE_NAME MATCHES "_wmma")
-            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM EX_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         endif()
     endif()
     set_source_files_properties(${FILE_NAME} PROPERTIES LANGUAGE HIP)
     add_executable(${EXAMPLE_NAME} ${FILE_NAME})
example/ck_tile/01_fmha/generate.py

@@ -47,6 +47,9 @@ def list_blobs(output_file : Optional[str], api_list : List[str], kernel_filter
     assert output_file is not None
     file_path = Path(output_file)
+
+    # create an empty file / drop its contents if it exists
+    open(file_path, "w").close()
     for api in api_list:
         handler = handlers[api][HandlerId.LIST_BLOBS]
         handler(file_path, kernel_filter, receipt, mask_impl)
example/ck_tile/02_layernorm2d/generate.py

@@ -559,7 +559,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         w_p = Path(self.working_path)
         list_p = w_p / 'layernorm2d_fwd_blobs.txt'
         blobs = self.get_blobs()
-        with list_p.open('a') as list_f:
+        with list_p.open('w') as list_f:
             # api related file
             list_f.write(str(w_p / (self.name_api + ".cpp")) + "\n")
             list_f.write(str(w_p / (self.name_common_header + ".hpp")) + "\n")
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp

@@ -93,12 +93,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
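Note: this hunk, and the matching ones in the device and grid headers that follow, wrap each per-batch pointer offset in static_cast<long_index_t> before broadcasting it from the first lane. A plausible motivation, not stated in the commit, is overload/width selection: if the offset reaches the broadcast as a 32-bit value, offsets beyond 2^31 elements silently wrap. A minimal host-side sketch of that hazard, with stand-in broadcast functions (not CK's amd_wave_read_first_lane):

#include <cstdint>
#include <cstdio>

using long_index_t = int64_t;

// Stand-ins for a lane-broadcast primitive: one overload traffics in 32 bits,
// the other preserves 64 bits. Which one is chosen depends on the argument type.
int32_t wave_broadcast(int32_t v) { return v; }
int64_t wave_broadcast(int64_t v) { return v; }

int main()
{
    // A batch offset past 2^31 elements, as can occur in large grouped convolutions.
    long_index_t offset = 3'000'000'000LL;

    // If the offset reaches the broadcast as a 32-bit value, it wraps negative.
    long_index_t bad = wave_broadcast(static_cast<int32_t>(offset));
    // Widening first, as the diff does, selects the 64-bit overload and keeps all bits.
    long_index_t good = wave_broadcast(static_cast<long_index_t>(offset));

    std::printf("truncated: %lld  widened: %lld\n", (long long)bad, (long long)good);
}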
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp

@@ -60,12 +60,12 @@ __global__ void
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
@@ -117,12 +117,12 @@ __global__ void
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
     const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     // Pass two lds pointer is the key to tell compiler that ds_read/write
     // operate on different lds chunk at same time without order dependecy
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp

@@ -98,12 +98,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t c_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp

@@ -60,12 +60,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
@@ -155,12 +155,12 @@ __global__ void
         __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
     const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
     const long_index_t a_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
     const long_index_t b_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
     const long_index_t e_batch_offset =
-        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+        amd_wave_read_first_lane(
+            static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
     const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp

@@ -121,10 +121,10 @@ struct GridwiseTensorRearrange
             __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
         // Global Memory
         const index_t a_batch_offset =
-            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+            __builtin_amdgcn_readfirstlane(
+                static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
         const index_t c_batch_offset =
-            __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
+            __builtin_amdgcn_readfirstlane(
+                static_cast<long_index_t>(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)));
         const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize());
library/include/ck/library/utility/check_err.hpp

@@ -24,7 +24,7 @@ namespace ck {
 namespace utils {

 template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
-double get_relative_threshold(const int numberOfAccumulations = 1)
+double get_relative_threshold(const int number_of_accumulations = 1)
 {
     using F8  = ck::f8_t;
     using F16 = ck::half_t;
@@ -79,13 +79,13 @@ double get_relative_threshold(const int numberOfAccumulations = 1)
     }
     else
     {
-        acc_error = std::pow(2, -NumericUtils<AccDataType>::mant) * 0.5 * numberOfAccumulations;
+        acc_error = std::pow(2, -NumericUtils<AccDataType>::mant) * 0.5 * number_of_accumulations;
     }
     return std::max(acc_error, midway_error);
 }

 template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
-double get_absolute_threshold(const double max_possible_num, const int numberOfAccumulations = 1)
+double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
 {
     using F8  = ck::f8_t;
     using F16 = ck::half_t;
@@ -142,7 +142,7 @@ double get_absolute_threshold(const double max_possible_num, const int numberOfA
     else
     {
         acc_error =
-            std::pow(2, expo - NumericUtils<AccDataType>::mant) * 0.5 * numberOfAccumulations;
+            std::pow(2, expo - NumericUtils<AccDataType>::mant) * 0.5 * number_of_accumulations;
     }
     return std::max(acc_error, midway_error);
 }
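Note: beyond the snake_case rename, the formula these hunks touch bounds the accumulated rounding error: each of the number_of_accumulations additions contributes at most half a ULP of the accumulator type, i.e. 2^(-mant) * 0.5 per step, so the threshold grows linearly with the accumulation count. A small self-contained sketch of that scaling; mant = 10 is assumed here as a stand-in for NumericUtils<half_t>::mant:

#include <cmath>
#include <cstdio>
#include <initializer_list>

int main()
{
    const int mant = 10; // fp16 mantissa bits (assumption for illustration)
    // One rounding step per accumulation, each bounded by half a ULP.
    for(int number_of_accumulations : {1, 8, 4096})
    {
        double acc_error = std::pow(2, -mant) * 0.5 * number_of_accumulations;
        std::printf("n = %4d -> relative threshold ~ %g\n", number_of_accumulations, acc_error);
    }
}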
library/src/tensor_operation_instance/gpu/CMakeLists.txt

@@ -88,19 +88,19 @@ function(add_instance_library INSTANCE_NAME)
     foreach(source IN LISTS ARGN)
         set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
         if(source MATCHES "_xdl")
-            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(source MATCHES "_wmma")
-            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         elseif(source MATCHES "mha")
-            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         endif()
         #only build the fp8 gemm instances for gfx908/90a if the build argument is set
         if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
             if(source MATCHES "gemm_xdl_universal" AND source MATCHES "f8")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
             endif()
             if(source MATCHES "gemm_multiply_multiply_f8")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
             endif()
         endif()
         set(offload_targets)
library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt

@@ -27,11 +27,6 @@ rocm_install(FILES ${MHA_HEADERS} DESTINATION include/ck_tile/ops)
 # headers for building lib
 file(COPY ${MHA_HEADERS} DESTINATION ${FMHA_CPP_FOLDER})
-
-# Delete the blob file if it exists to avoid append of old content.
-if(EXISTS ${FMHA_CPP_FOLDER}/blob_list.txt)
-    file(REMOVE ${FMHA_CPP_FOLDER}/blob_list.txt)
-endif()
 set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd")
 # generate a list of kernels, but not actually emit files at config stage
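Note: taken together with the generate.py changes above (the added open(file_path, "w").close() in 01_fmha and the 'a' to 'w' mode switch in 02_layernorm2d), this deletion moves the stale-blob-list cleanup out of CMake: the generators now truncate the list themselves, so the configure step no longer has to delete it. A hedged illustration of the append-vs-truncate difference, transposed to C++ streams:

#include <fstream>

int main()
{
    // Appending across repeated runs accumulates stale entries...
    { std::ofstream f("blob_list.txt", std::ios::app); f << "stale_kernel.cpp\n"; }
    // ...while the default open mode truncates first, like Python's open(path, "w"),
    // so each (re-)generation starts from an empty list.
    { std::ofstream f("blob_list.txt"); f << "fresh_kernel.cpp\n"; }
}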
profiler/include/profiler/profile_pool3d_fwd_impl.hpp

@@ -240,6 +240,19 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
     {
         out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data());

+        auto number_of_accumulations = 1;
+        static_assert(ReduceOpId == ck::ReduceTensorOp::AVG ||
+                          ReduceOpId == ck::ReduceTensorOp::MAX,
+                      "Warning: Unhandled ReduceOpId for setting up the number of accumulations!");
+        if constexpr(ReduceOpId == ck::ReduceTensorOp::AVG)
+        {
+            for(size_t i = 0; i < kernel_params.window_spatial_lengths.size(); ++i)
+            {
+                number_of_accumulations *= kernel_params.window_spatial_lengths.at(i);
+            }
+        }
+
         auto absolute_error_threshold = 1.0;
         switch(in_params.init_method)
         {
@@ -250,9 +263,10 @@ bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams&
             absolute_error_threshold =
                 ck::utils::get_absolute_threshold<ComputeDataType, OutDataType>(
-                    absolute_error_threshold);
+                    absolute_error_threshold, number_of_accumulations);
             auto relative_error_threshold =
-                ck::utils::get_relative_threshold<ComputeDataType, OutDataType>();
+                ck::utils::get_relative_threshold<ComputeDataType, OutDataType>(
+                    number_of_accumulations);
             bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData,
                                              out_n_c_do_ho_wo_host.mData,
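Note: the added block mirrors the reference math: an AVG pool accumulates one addend per element of the pooling window, so the accumulation count is the product of the window's spatial lengths, which then feeds both threshold helpers above. A tiny sketch of the count, with made-up window lengths rather than values read from kernel_params:

#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical 3D pooling window (D, H, W).
    std::vector<int> window_spatial_lengths{2, 3, 3};

    int number_of_accumulations = 1;
    for(size_t i = 0; i < window_spatial_lengths.size(); ++i)
        number_of_accumulations *= window_spatial_lengths.at(i);

    // 2 * 3 * 3 = 18 additions feed each averaged output element.
    std::printf("number_of_accumulations = %d\n", number_of_accumulations);
}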
profiler/src/profile_layernorm_fwd.cpp

@@ -85,7 +85,7 @@ int profile_layernorm(int argc, char* argv[])
     if(data_type == ck::DataTypeEnum::Half)
     {
-        ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F32, false, rank>(
+        ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, F16, false, rank>(
             do_verification, init_method, do_log, time_kernel, length);
     }
     else if(data_type == ck::DataTypeEnum::Float)
test/CMakeLists.txt

@@ -64,11 +64,11 @@ function(add_test_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-            list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(ARGN MATCHES "_wmma")
-            list(REMOVE_ITEM TEST_TARGETS gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         elseif(ARGN MATCHES "_smfmac")
-            list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
         endif()
     endif()
     set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
     add_executable(${TEST_NAME} ${ARGN})
@@ -141,11 +141,11 @@ function(add_gtest_executable TEST_NAME)
     #only continue if there are some source files left on the list
     if(ARGN)
         if(ARGN MATCHES "_xdl")
-            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201)
         elseif(ARGN MATCHES "_wmma")
-            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
         elseif(ARGN MATCHES "_smfmac")
-            list(REMOVE_ITEM TEST_TARGETS gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
+            list(REMOVE_ITEM TEST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx908 gfx90a gfx1200 gfx1201)
         endif()
     endif()
     set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
     add_executable(${TEST_NAME} ${ARGN})