Enable build on Navi3x (gfx1100)

4a93c836 · rocking · 47b3e10b · 4a93c836 · 4a93c836
Commit 4a93c836 authored Mar 16, 2023 by rocking
2 changed files
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -116,26 +116,26 @@ template <typename GridwiseGemm,
          bool HasDoubleTailKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_grouped_conv_fwd_dl_multiple_d(
+    kernel_grouped_conv_fwd_dl_multiple_d(
-            const ABDataType* __restrict__ p_a_grid,
+        const ABDataType* __restrict__ p_a_grid,
-            const ABDataType* __restrict__ p_b_grid,
+        const ABDataType* __restrict__ p_b_grid,
-            DsPointer p_ds_grid,
+        DsPointer p_ds_grid,
-            EDataType* __restrict__ p_e_grid,
+        EDataType* __restrict__ p_e_grid,
-            const AElementwiseOperation a_element_op,
+        const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
+        const BElementwiseOperation b_element_op,
-            const CDEElementwiseOperation cde_element_op,
+        const CDEElementwiseOperation cde_element_op,
-            const index_t batch_count,
+        const index_t batch_count,
-            const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1,
+        const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1,
-            const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1,
+        const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1,
-            const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11,
+        const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11,
-            const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
+        const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
-            const Block2CTileMap block_2_ctile_map,
+        const Block2CTileMap block_2_ctile_map,
-            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
+        const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
-    defined(__gfx90a__) || defined(__gfx908__))
+    defined(__gfx1100__) || defined(__gfx90a__) || defined(__gfx908__))
    // offset base pointer for each work-group
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -380,8 +380,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
    }
    // desc for problem definition
-    using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
+    using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(MakeAGridDescriptor_AK0_M_AK1<ALayout>(
-        MakeAGridDescriptor_AK0_M_AK1<ALayout>({}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>;
+        {}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>;
    using BGridDesc_BK0_N_BK1 =
        remove_cvref_t<decltype(MakeBGridDescriptor_BK0_N_BK1<BLayout>({}, {}))>;
    using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N({}, {}))>;
@@ -709,8 +709,9 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
        namespace ctc = tensor_layout::convolution;
        // check device
-        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
+        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx90a" ||
-             ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908"))
+             ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx1030" ||
+             ck::get_device_name() == "gfx1100"))
        {
            return false;
        }

--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3"                                                                          \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=OFF                                                                                  \
-D GPU_TARGETS="gfx908;gfx90a"                                                                      \
+-D GPU_TARGETS="gfx1100"                                                                      \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}