Commit 23f633cd authored by Chao Liu

clean up for miopen

parent 3c0646d4
@@ -66,10 +66,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
     constexpr auto I1 = Number<1>{};
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
     constexpr auto I5 = Number<5>{};
-    constexpr auto I6 = Number<6>{};
-    constexpr auto I7 = Number<7>{};

     constexpr auto True = integral_constant<bool, true>{};
@@ -77,10 +74,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
     constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
     constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};

     constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0);
     constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1);
-    constexpr index_t Hi = in_n_c_h_w_global_desc.GetLength(I2);
-    constexpr index_t Wi = in_n_c_h_w_global_desc.GetLength(I3);

     constexpr index_t K = out_n_k_h_w_global_desc.GetLength(I1);
     constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2);
@@ -346,7 +341,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
     {
         constexpr index_t K2 = GemmMPerThreadSubC;
         constexpr index_t K1 = GemmMLevel0Cluster * GemmMLevel1Cluster;
-        constexpr index_t K0 = K / (K1 * K2);

         // define tensor descriptor for threadwise copy
         // output memory layout descriptor in register
...
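The deletions in this first file remove constexpr locals (I4, I6, I7, Hi, Wi, K0) that the kernel no longer reads; since Number<> and GetLength() only carry compile-time constants, dropping them changes nothing in the generated code and mainly keeps the build free of unused-variable warnings. As a rough illustration of the idiom, here is a minimal sketch with made-up names, not the actual ck definitions:

#include <cstdio>

// Hypothetical stand-ins for ck's Number<> and tensor descriptors: the lengths live in
// the type, so GetLength(Number<I>{}) folds to a compile-time constant.
template <unsigned long long N>
struct Number
{
    static constexpr unsigned long long value = N;
};

template <unsigned long long... Ls>
struct TensorDescSketch
{
    template <unsigned long long I>
    static constexpr unsigned long long GetLength(Number<I>)
    {
        constexpr unsigned long long lengths[] = {Ls...};
        return lengths[I];
    }
};

int main()
{
    using InDesc = TensorDescSketch<128, 64, 28, 28>; // assumed N, C, Hi, Wi
    constexpr auto Hi = InDesc::GetLength(Number<2>{});
    static_assert(Hi == 28, "length is known at compile time");
    std::printf("Hi = %llu\n", Hi);
}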
@@ -54,12 +54,8 @@ __host__ __device__ constexpr auto
 template <class TDesc>
 __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
 {
-    const auto desc = TDesc{};
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-
-    printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), desc.RowStride());
+    printf(
+        "%s NRow %u NCol %u RowStride %u\n", s, TDesc::NRow(), TDesc::NCol(), TDesc::RowStride());
 }

 } // namespace ck
...
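The printer above now queries NRow()/NCol()/RowStride() on the descriptor type directly instead of default-constructing a TDesc and reading them off the instance, and it drops the I0/I1 constants it never used. This works because the descriptor exposes these queries as static members. A minimal sketch of the pattern, with an assumed shape rather than the real ConstantMatrixDescriptor:

#include <cstdio>

// Assumed shape of a constant matrix descriptor: all queries are static, so callers
// such as the printer can use TDesc::NRow() without ever constructing an object.
template <unsigned NRow_, unsigned NCol_, unsigned RowStride_>
struct ConstantMatrixDescriptorSketch
{
    static constexpr unsigned NRow() { return NRow_; }
    static constexpr unsigned NCol() { return NCol_; }
    static constexpr unsigned RowStride() { return RowStride_; }
};

template <class TDesc>
void print_desc(const char* s)
{
    std::printf(
        "%s NRow %u NCol %u RowStride %u\n", s, TDesc::NRow(), TDesc::NCol(), TDesc::RowStride());
}

int main() { print_desc<ConstantMatrixDescriptorSketch<8, 8, 8>>("c_thread_mtx"); }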
@@ -49,7 +49,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
     constexpr index_t N = BlockMatrixB::NCol();
-    constexpr index_t K = BlockMatrixA::NRow();

     static_assert(M % (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster) == 0 &&
                       N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
@@ -100,12 +99,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     {
         constexpr auto c_thread_mtx = ThreadMatrixC{};

-        constexpr index_t MPerThread = c_thread_mtx.NRow();
-        constexpr index_t NPerThread = c_thread_mtx.NCol();
-
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-
         constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
         constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
@@ -125,9 +118,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                                const FloatB* __restrict__ p_b_block,
                                FloatC* __restrict__ p_c_thread) const
     {
-        constexpr auto True = integral_constant<bool, true>{};
-        constexpr auto False = integral_constant<bool, false>{};
-
         constexpr auto a_block_mtx = BlockMatrixA{};
         constexpr auto b_block_mtx = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};
@@ -146,13 +136,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_thread_mtx =
             make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});

-        // thread A-sub, B-sub for copy
-        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
-
-        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
-
         FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
         FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
@@ -172,9 +155,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         using Float4 = vector_type<float, 4>::MemoryType;

-        Float4* reg_a = (Float4*)(p_a_thread);
-        Float4* reg_b = (Float4*)(p_b_thread);
-        Float4* reg_c = (Float4*)(p_c_thread);
+        Float4* reg_a = reinterpret_cast<Float4*>(p_a_thread);
+        Float4* reg_b = reinterpret_cast<Float4*>(p_b_thread);
+        Float4* reg_c = reinterpret_cast<Float4*>(p_c_thread);

         reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[mMyThreadOffsetA]);
         reg_b[0] = *reinterpret_cast<const Float4*>(&p_b_block[mMyThreadOffsetB]);
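The three register-pointer casts above switch from C-style casts to reinterpret_cast, matching the style already used for the loads from p_a_block and p_b_block; behaviour is identical, but the named cast makes the reinterpretation explicit and, unlike a C-style cast, refuses to silently cast away const. A hedged sketch of the difference, with Float4 standing in for vector_type<float, 4>::MemoryType:

// Assumed vector type for illustration (clang/HIP ext_vector_type extension).
using Float4 = float __attribute__((ext_vector_type(4)));

__device__ void load_one_float4(float* p_a_thread, const float* p_a_block, unsigned offset)
{
    Float4* reg_a = reinterpret_cast<Float4*>(p_a_thread); // fine: no const involved
    // Float4* bad = reinterpret_cast<Float4*>(p_a_block); // would not compile: casts away const
    // Float4* ugly = (Float4*)p_a_block;                  // a C-style cast would accept this silently
    reg_a[0] = *reinterpret_cast<const Float4*>(&p_a_block[offset]); // const-correct source cast
}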
@@ -215,8 +198,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_block_mtx = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};

-        constexpr index_t M = a_block_mtx.NCol();
-        constexpr index_t N = b_block_mtx.NCol();
         constexpr index_t K = a_block_mtx.NRow();

         constexpr index_t MPerThread = c_thread_mtx.NRow();
@@ -245,8 +226,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
         constexpr index_t NRepeat = NPerThread / NPerThreadSubC;

-        const FloatA* const p_a_block_thread_offset = p_a_block + mMyThreadOffsetA;
-
 #pragma unroll
         // loop over k
         for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)
@@ -306,8 +285,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr auto b_block_mtx = BlockMatrixB{};
         constexpr auto c_thread_mtx = ThreadMatrixC{};

-        constexpr index_t M = a_block_mtx.NCol();
-        constexpr index_t N = b_block_mtx.NCol();
         constexpr index_t K = a_block_mtx.NRow();

         constexpr index_t MPerThread = c_thread_mtx.NRow();
...
@@ -93,8 +93,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
                           "wrong! cannot evenly divide sliced tensor into cluster");
         });

-        constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
-
         // for now, only support SubLengths == 1 on a merged dimension that constains
         // multiple original dimensions
         static_for<0, nDim, 1>{}([&](auto IDim_) {
@@ -297,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto IDim = Number<IDim_>{};
             constexpr index_t idim = IDim;

-            static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto fwd) {
+            static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto) {
                 // logic for a merged dimension, also works for non-merged dimension, but its logic may
                 // be unncessarily complicated for compiler to remove calculations that are useless for
                 // a non-merged dimension
@@ -337,7 +335,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
                 mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
-            }).Else([&](auto fwd) {
+            }).Else([&](auto) {
                 // Logic for non-merged dimension. If you are never going to move the slicing window on
                 // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
                 // which are being calculated here, will never be used later. In this case, compiler
...
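The two static_if branches above drop the name fwd from their generic-lambda parameter because the forwarder is never used inside those branch bodies; an unnamed parameter keeps the call signature intact while avoiding unused-parameter warnings. A rough sketch of the idea, with simplified, assumed semantics rather than ck's static_if implementation:

// Simplified stand-in: invoke the callable only when the compile-time condition holds,
// passing it one argument (the forwarder) that the callable is free to ignore.
template <bool Condition>
struct StaticIfSketch
{
    template <class F>
    static void Then(F f)
    {
        if constexpr(Condition)
        {
            f(0); // 0 stands in for the forwarder object
        }
    }
};

void example()
{
    // The lambda never touches its argument, so it can stay unnamed.
    StaticIfSketch<true>::Then([&](auto) { /* branch body */ });
}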
@@ -71,7 +71,7 @@ __device__ void threadwise_gemm(MatrixA,
                                 integral_constant<bool, TransC>,
                                 FloatC* __restrict__ p_c_thread)
 {
-    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto fwd) {
+    static_if<TransA && (!TransB) && (!TransC)>{}([&](auto) {
         constexpr auto a_mtx = MatrixA{};
         constexpr auto b_mtx = MatrixB{};
         constexpr auto c_mtx = MatrixC{};
...
@@ -11,6 +11,7 @@
 namespace ck {

+// user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
 template <class Float,
           class SrcDesc,
           class DstDesc,
...
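The new comment documents a precondition rather than changing behaviour: when DataPerAccess (spelled DataPerAccesss in the added comment) is greater than 1, the copy presumably moves vector-typed chunks, and that is only well-defined if the source and destination addresses are aligned to the vector width. A small, hypothetical illustration of the kind of access involved, assuming 4 floats per access:

// Hypothetical 4-wide access: reading float4_t through a float* is only valid when the
// byte address is a multiple of 16, which is what the caller must guarantee when
// DataPerAccess == 4.
using float4_t = float __attribute__((ext_vector_type(4)));

__device__ float4_t load_vec4(const float* p, unsigned offset)
{
    return *reinterpret_cast<const float4_t*>(p + offset); // requires 16-byte alignment of p + offset
}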
@@ -253,7 +253,7 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
 }

 template <index_t... Xs, index_t... Ys>
-__host__ __device__ constexpr auto operator-(Sequence<Xs...> seq_x, Sequence<Ys...> seq_y)
+__host__ __device__ constexpr auto operator-(Sequence<Xs...>, Sequence<Ys...>)
 {
     static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");
...
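Dropping seq_x and seq_y from operator- follows the same pattern as the operator+ overload directly above it: a Sequence carries its values entirely in its template arguments, so the function parameters exist only to drive deduction and never need names. A minimal sketch of why this works, with an assumed shape rather than the real ck::Sequence:

#include <type_traits>

// Stand-in for ck::Sequence: the values are the template arguments.
template <unsigned... Is>
struct SequenceSketch
{
};

template <unsigned... Xs, unsigned... Ys>
constexpr auto operator-(SequenceSketch<Xs...>, SequenceSketch<Ys...>)
{
    static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");
    return SequenceSketch<(Xs - Ys)...>{};
}

// The operands contribute nothing but their types.
static_assert(std::is_same<decltype(SequenceSketch<4, 2>{} - SequenceSketch<1, 1>{}),
                           SequenceSketch<3, 1>>::value,
              "elementwise difference");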
@@ -102,7 +102,7 @@ __device__ void outerProduct1x4(const float& a,
                                 const vector_type<float, 4>::MemoryType& b,
                                 vector_type<float, 4>::MemoryType& c)
 {
-    outerProduct1x4(&a, (float*)&b, (float*)&c);
+    outerProduct1x4(&a, reinterpret_cast<const float*>(&b), reinterpret_cast<float*>(&c));
 }

 __device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
...
@@ -19,6 +19,10 @@ typedef float float4_t __attribute__((ext_vector_type(4)));

 using index_t = uint32_t;

+__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
+
+__device__ index_t get_block_1d_id() { return blockIdx.x; }
+
 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 {
     d += s0 * s1;
...
@@ -23,6 +23,10 @@ using float4_t = float4;

 using index_t = uint32_t;

+__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
+
+__device__ index_t get_block_1d_id() { return blockIdx.x; }
+
 __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
 {
     d += s0 * s1;
...
@@ -18,7 +18,7 @@ struct forwarder
 struct swallow
 {
     template <class... Ts>
-    __host__ __device__ constexpr swallow(Ts&&... ts)
+    __host__ __device__ constexpr swallow(Ts&&...)
     {
     }
 };
...
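swallow's constructor takes an arbitrary pack and deliberately does nothing with it, so naming the pack ts only produced an unused-parameter warning. Its usual job is to give a pack expansion somewhere to expand; a sketch of that kind of usage follows (hypothetical call site, not taken from this diff):

#include <cstdio>

// Stand-in for ck's swallow: accept anything, use nothing.
struct SwallowSketch
{
    template <class... Ts>
    constexpr SwallowSketch(Ts&&...)
    {
    }
};

// Call f once per index; "(f(Is), 0)" evaluates the call and yields an int for
// SwallowSketch to consume, which is what makes the pack expansion legal here.
template <unsigned... Is, class F>
void for_each_index(F f)
{
    SwallowSketch{(f(Is), 0)...};
}

int main()
{
    for_each_index<0, 1, 2>([](unsigned i) { std::printf("index %u\n", i); });
}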
@@ -6,10 +6,6 @@
 namespace ck {

-__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
-
-__device__ index_t get_block_1d_id() { return blockIdx.x; }
-
 template <class X, class Y>
 using is_same = std::is_same<X, Y>;
...
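Taken together with the two additions earlier in this commit (the @@ -19,6 +19,10 @@ and @@ -23,6 +23,10 @@ hunks), this last hunk moves get_thread_local_1d_id and get_block_1d_id out of the shared ck utility header and into the two backend-specific headers shown above (the ones defining float4_t and fused_multiply_accumulate), keeping the shared header free of compiler built-ins; the wrapper bodies themselves are unchanged. Shared kernel code keeps calling the wrappers rather than touching threadIdx/blockIdx directly, for example (hypothetical helper for illustration):

#include <cstdint>

using index_t = uint32_t;

// Backend header provides these thin wrappers over the built-ins.
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }

// Shared code can then stay backend-neutral; block_size is an assumed parameter.
__device__ index_t get_global_1d_id(index_t block_size)
{
    return get_block_1d_id() * block_size + get_thread_local_1d_id();
}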