Commit 9e0d6146 authored by Chao Liu

add heat_map

parent e69b1970
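
This commit adds a "heat map" dump for studying global-memory access patterns: the gridwise convolution's Run becomes __host__ __device__, GridwiseGemmTransposedANormalBNormalC_v1 gains a host-only Run that writes the global-memory offset of every element each GEMM block tile reads into per-tile CSV files, and the driver switches to a 3x3, 14x14, C = K = 128 problem so the dump stays manageable.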
@@ -45,7 +45,7 @@ template <index_t GridSize,
           index_t GemmCThreadCopyDstDataPerWrite_GemmN1>
 struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
 {
-    __device__ void Run(const Float* const __restrict__ p_in_global,
+    __host__ __device__ void Run(const Float* const __restrict__ p_in_global,
                         const Float* const __restrict__ p_wei_global,
                         Float* const __restrict__ p_out_global) const
     {
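For context: marking Run as __host__ __device__ compiles the same body for both CPU and GPU, which is what allows the driver further down to call it on the host and produce the CSV dump. A minimal standalone sketch of the pattern, assuming a HIP toolchain (the function below is hypothetical, not from this repo):

#include <hip/hip_runtime.h>
#include <cstdio>

// compiled for both host and device: the same index arithmetic can run
// inside a kernel or be traced on the CPU without a launch
__host__ __device__ int row_major_offset(int row, int col, int stride)
{
    return row * stride + col;
}

int main()
{
    std::printf("%d\n", row_major_offset(2, 3, 8)); // prints 19
}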
@@ -9,6 +9,8 @@
 #include "threadwise_generic_tensor_slice_copy.hpp"
 #include "blockwise_gemm.hpp"
 
+#include <fstream>
+
 namespace ck {
 
 template <index_t GridSize,
@@ -385,6 +387,88 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
         Run(p_a_global, p_b_global, p_c_global, p_shared_block);
     }
 
+    // host-side walk of the same GEMM blocking: for every (m_block, n_block)
+    // tile, record the global-memory offset of each A and B element the tile
+    // reads, one CSV per tile (the pointers are unused; only the layout
+    // descriptors matter here)
+    __host__ void Run(const Float* __restrict__ p_a_global,
+                      const Float* __restrict__ p_b_global,
+                      Float* __restrict__ p_c_global) const
+    {
+        constexpr auto a_k_m_global_desc = AGlobalDesc{};
+        constexpr auto b_k_n_global_desc = BGlobalDesc{};
+        constexpr auto c_m_n_global_desc = CGlobalDesc{};
+
+        constexpr auto K = a_k_m_global_desc.GetLengths()[0];
+        constexpr auto M = a_k_m_global_desc.GetLengths()[1];
+        constexpr auto N = b_k_n_global_desc.GetLengths()[1];
+
+        constexpr index_t MBlockWork = M / MPerBlock;
+        constexpr index_t NBlockWork = N / NPerBlock;
+        constexpr index_t KBlockWork = K / KPerBlock;
+
+        using ACoord = typename TensorCoordinate<AGlobalDesc>::type;
+        using BCoord = typename TensorCoordinate<BGlobalDesc>::type;
+
+        for(index_t m_block_work_id = 0; m_block_work_id < MBlockWork; ++m_block_work_id)
+        {
+            for(index_t n_block_work_id = 0; n_block_work_id < NBlockWork; ++n_block_work_id)
+            {
+                // A matrix
+                {
+                    std::fstream afile;
+                    afile.open("a_mblock_" + std::to_string(m_block_work_id) + "_nblock_" + std::to_string(n_block_work_id) + ".csv", std::fstream::out);
+                    afile << "kblock, offset" << std::endl;
+
+                    for(index_t k_block_work_id = 0; k_block_work_id < KBlockWork; ++k_block_work_id)
+                    {
+                        for(index_t k = k_block_work_id * KPerBlock; k < (k_block_work_id + 1) * KPerBlock; ++k)
+                        {
+                            for(index_t m = m_block_work_id * MPerBlock; m < (m_block_work_id + 1) * MPerBlock; ++m)
+                            {
+                                auto a_coord = ACoord({k, m});
+
+                                if(a_coord.IsOffsetValidAssumingUpperIndexIsValid())
+                                {
+                                    // scale the K-block id so each K-block lands
+                                    // in a distinct value band of the heat map
+                                    afile << k_block_work_id * 100 << "," << a_coord.GetOffset() << std::endl;
+                                }
+                            }
+                        }
+                    }
+
+                    afile.close();
+                }
+
+                // B matrix
+                {
+                    std::fstream bfile;
+                    bfile.open("b_mblock_" + std::to_string(m_block_work_id) + "_nblock_" + std::to_string(n_block_work_id) + ".csv", std::fstream::out);
+                    bfile << "kblock, offset" << std::endl;
+
+                    for(index_t k_block_work_id = 0; k_block_work_id < KBlockWork; ++k_block_work_id)
+                    {
+                        for(index_t k = k_block_work_id * KPerBlock; k < (k_block_work_id + 1) * KPerBlock; ++k)
+                        {
+                            for(index_t n = n_block_work_id * NPerBlock; n < (n_block_work_id + 1) * NPerBlock; ++n)
+                            {
+                                auto b_coord = BCoord({k, n});
+
+                                if(b_coord.IsOffsetValidAssumingUpperIndexIsValid())
+                                {
+                                    bfile << k_block_work_id * 100 << "," << b_coord.GetOffset() << std::endl;
+                                }
+                            }
+                        }
+                    }
+
+                    bfile.close();
+                }
+            }
+        }
+    }
 };
 
 } // namespace ck
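The host-side Run above writes one "kblock, offset" CSV per (m_block, n_block) tile, with the kblock column scaled by 100 so each K-block occupies a distinct value band when rendered. A sketch of a post-processing step (hypothetical tool, not part of the commit; the bucket width is an assumption): fold one CSV into per-bucket access counts, the raw series an offset heat map is drawn from.

#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main()
{
    // file name follows the commit's naming scheme
    std::ifstream csv("a_mblock_0_nblock_0.csv");
    constexpr long bucket = 256; // assumed bucket width; tune to the tensor size

    std::string line;
    std::getline(csv, line); // skip the "kblock, offset" header

    std::map<long, long> hits; // offset bucket -> access count
    while(std::getline(csv, line))
    {
        std::istringstream row(line);
        std::string kblock, offset;
        std::getline(row, kblock, ',');
        std::getline(row, offset, ',');
        ++hits[std::stol(offset) / bucket];
    }

    // emit "bucket_start,count" rows, ready for any plotting tool
    for(const auto& [b, n] : hits)
        std::cout << b * bucket << "," << n << "\n";
}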
@@ -118,7 +118,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN   = 1;
     constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 0
+#elif 1
     // cdata = 64, BlockSize = 256, 128x128x8
     constexpr index_t BlockSize = 256;
@@ -1002,7 +1002,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     printf("Start running %d times...\n", nrepeat);
 
-    cudaDeviceSynchronize();
+    hipDeviceSynchronize();
 
     auto start = std::chrono::steady_clock::now();
 
     for(index_t i = 0; i < nrepeat; ++i)
@@ -1018,7 +1018,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     }
 
-    cudaDeviceSynchronize();
+    hipDeviceSynchronize();
 
    auto end = std::chrono::steady_clock::now();
 
    float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;
@@ -1029,4 +1029,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
            (std::size_t(1000) * 1000 * 1000) / ave_time);
 
     out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+
+    // one extra host-side call after timing, so the heat-map CSVs get written
+    gridwise_conv.Run(static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
+                      static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
+                      static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
 }
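The timing above follows the usual synchronize / launch nrepeat times / synchronize pattern, then divides the elapsed wall-clock time by nrepeat. The same pattern as a reusable helper, sketched under the assumption of a HIP toolchain (average_ms is a hypothetical name, not from this repo):

#include <hip/hip_runtime.h>
#include <chrono>

// times nrepeat invocations of launch() and returns the average milliseconds;
// the surrounding synchronizations keep asynchronous kernel launches honest
template <typename Launch>
float average_ms(Launch&& launch, int nrepeat)
{
    hipDeviceSynchronize(); // drain prior GPU work

    auto start = std::chrono::steady_clock::now();

    for(int i = 0; i < nrepeat; ++i)
        launch(); // e.g. a lambda wrapping the kernel launch

    hipDeviceSynchronize(); // wait for the last launch to finish

    auto end = std::chrono::steady_clock::now();

    return std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;
}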
@@ -18,7 +18,7 @@
 //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 
 int main(int argc, char* argv[])
@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 3>;
     using RightPads = Sequence<0, 3>;
-#elif 1
+#elif 0
     // 3x3, 299x299 stride=2
     constexpr index_t N  = 128;
     constexpr index_t C  = 3;
@@ -267,7 +267,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<1, 0>;
     using RightPads = Sequence<1, 0>;
-#elif 1
+#elif 0
     // 3x3, 147x147
     constexpr index_t N  = 128;
     constexpr index_t C  = 64;
@@ -298,7 +298,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
-#elif 1
+#elif 0
     // 3x3, 73x73
     constexpr index_t N  = 128;
     constexpr index_t C  = 64;
@@ -331,10 +331,10 @@ int main(int argc, char* argv[])
 #elif 0
     // 1x1, 14x14
     constexpr index_t N  = 128;
-    constexpr index_t C  = 1024;
+    constexpr index_t C  = 128;
     constexpr index_t HI = 14;
     constexpr index_t WI = 14;
-    constexpr index_t K  = 256;
+    constexpr index_t K  = 128;
     constexpr index_t Y  = 1;
     constexpr index_t X  = 1;
@@ -373,13 +373,13 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<1, 1>;
     using RightPads = Sequence<1, 1>;
-#elif 0
+#elif 1
     // 3x3, 14x14
     constexpr index_t N  = 128;
-    constexpr index_t C  = 256;
+    constexpr index_t C  = 128;
     constexpr index_t HI = 14;
     constexpr index_t WI = 14;
-    constexpr index_t K  = 256;
+    constexpr index_t K  = 128;
     constexpr index_t Y  = 3;
     constexpr index_t X  = 3;