compile ck for all targets

4e075420 · illsilin · 64687816 · 4e075420 · 4e075420 · 4e075420
Commit 4e075420 authored Apr 07, 2022 by illsilin
15 changed files
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ mkdir build && cd build
 cmake                                                                 \
 -D BUILD_DEV=OFF                                                      \
 -D CMAKE_BUILD_TYPE=Release                                           \
-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3  \
+-D CMAKE_CXX_FLAGS=" -O3  \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                             \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                        \
 ..

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
@@ -54,6 +54,7 @@ __global__ void
            const ComputeBasePrtOfBatch compute_base_ptr_of_batch_,
            const Block2CTileMap block_2_ctile_map)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -933,6 +934,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi

        return str.str();
    }
+#endif //end of if defined (defined(__gfx908__) || defined(__gfx90a__))
 };

 } // namespace device

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
@@ -46,6 +46,7 @@ __global__ void
            const ComputeBasePrtOfBatch compute_base_ptr_of_batch_,
            const Block2CTileMap block_2_ctile_map)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -609,6 +610,7 @@ struct DeviceBatchedGemmXdl

        return str.str();
    }
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 };

 } // namespace device

--- a/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp
@@ -49,6 +49,7 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / num_batches);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -654,6 +655,7 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_

        return str.str();
    }
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 };

 } // namespace device

--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
@@ -55,7 +55,7 @@ __global__ void kernel_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k,
                                         OutDataType* const __restrict__ p_out_global,
                                         IndexDataType* const __restrict__ p_indices_global)
 {
-    if constexpr(!NeedIndices)
+   if constexpr(!NeedIndices)
    {
        GridwiseReduction::Run(in_grid_desc_m_k,
                               out_grid_desc_m,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp
@@ -48,6 +48,7 @@ __global__ void
            const DGridDescriptor_MBlock_MPerBlock d_grid_desc_mblock_mperblock,
            const Block2CTileMap block_2_ctile_map)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid,
@@ -66,6 +67,7 @@ __global__ void
                                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
                                                   d_grid_desc_mblock_mperblock,
                                                   block_2_ctile_map);
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <typename FloatAB,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp
@@ -38,6 +38,7 @@ __global__ void
                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
                                    const Block2CTileMap block_2_ctile_map)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid,
@@ -51,6 +52,7 @@ __global__ void
                                                   b_grid_desc_bk0_n_bk1,
                                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
                                                   block_2_ctile_map);
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <typename FloatAB,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -39,6 +39,7 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid,
@@ -52,6 +53,7 @@ __global__ void
                                                   b_element_op,
                                                   c_element_op,
                                                   block_2_ctile_map);
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <typename GridwiseGemm,
@@ -74,6 +76,7 @@ __global__ void
            const BElementwiseOperation b_element_op,
            const CElementwiseOperation c_element_op)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();
@@ -126,6 +129,7 @@ __global__ void
        gemm_desc_ptr[group_id].block_2_ctile_map_,
        block_id_grp);
 #endif
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <index_t BlockSize,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp
@@ -37,6 +37,7 @@ __global__ void
                                const CElementwiseOperation c_element_op,
                                const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    constexpr index_t shared_block_size =
        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);

@@ -53,6 +54,7 @@ __global__ void
                                                  b_element_op,
                                                  c_element_op,
                                                  c_block_cluster_adaptor);
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <index_t BlockSize,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -39,6 +39,7 @@ __global__ void
                                  const CElementwiseOperation c_element_op,
                                  const CBlockClusterAdaptor c_block_cluster_adaptor)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    constexpr index_t shared_block_size =
        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);

@@ -55,6 +56,7 @@ __global__ void
                                                  b_element_op,
                                                  c_element_op,
                                                  c_block_cluster_adaptor);
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <index_t BlockSize,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp
@@ -42,6 +42,7 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainK0BlockLoop>(
@@ -56,6 +57,7 @@ __global__ void
        b_element_op,
        c_element_op,
        block_2_ctile_map);
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp
@@ -45,6 +45,7 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainK0BlockLoop>(
@@ -61,6 +62,7 @@ __global__ void
        b_element_op,
        c_element_op,
        block_2_ctile_map);
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp
@@ -49,6 +49,7 @@ __global__ void
            const CElementwiseOperation c_element_op,
            const Block2CTileMap block_2_ctile_map)
 {
+#if (defined(__gfx908__) || defined(__gfx90a__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainK0BlockLoop>(
@@ -67,6 +68,7 @@ __global__ void
        b_element_op,
        c_element_op,
        block_2_ctile_map);
+#endif //end of if (defined(__gfx908__) || defined(__gfx90a__))
 }

 template <

--- a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp
@@ -36,6 +36,7 @@ __global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffe
                                        DataType value)

 {
+
    using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<DataType, DataType>;

    constexpr auto I0 = Number<0>{};

--- a/script/cmake-rocm.sh
+++ b/script/cmake-rocm.sh
@@ -10,7 +10,7 @@ cmake
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                                                                  \
 -D BUILD_DEV=OFF                                                                                                                               \
 -D CMAKE_BUILD_TYPE=Release                                                                                                                    \
-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD"         \
+-D CMAKE_CXX_FLAGS=" -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD"         \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                                                                      \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                                                                 \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                                                              \