Commit 9e0d6146 authored by Chao Liu's avatar Chao Liu
Browse files

add heat_map

parent e69b1970
...@@ -45,7 +45,7 @@ template <index_t GridSize, ...@@ -45,7 +45,7 @@ template <index_t GridSize,
index_t GemmCThreadCopyDstDataPerWrite_GemmN1> index_t GemmCThreadCopyDstDataPerWrite_GemmN1>
struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
{ {
__device__ void Run(const Float* const __restrict__ p_in_global, __host__ __device__ void Run(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global, const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global) const Float* const __restrict__ p_out_global) const
{ {
......
...@@ -9,6 +9,8 @@ ...@@ -9,6 +9,8 @@
#include "threadwise_generic_tensor_slice_copy.hpp" #include "threadwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp" #include "blockwise_gemm.hpp"
#include<fstream>
namespace ck { namespace ck {
template <index_t GridSize, template <index_t GridSize,
...@@ -385,6 +387,88 @@ struct GridwiseGemmTransposedANormalBNormalC_v1 ...@@ -385,6 +387,88 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
Run(p_a_global, p_b_global, p_c_global, p_shared_block); Run(p_a_global, p_b_global, p_c_global, p_shared_block);
} }
__host__ void Run(const Float* __restrict__ p_a_global,
const Float* __restrict__ p_b_global,
Float* __restrict__ p_c_global) const
{
constexpr auto a_k_m_global_desc = AGlobalDesc{};
constexpr auto b_k_n_global_desc = BGlobalDesc{};
constexpr auto c_m_n_global_desc = CGlobalDesc{};
constexpr auto K = a_k_m_global_desc.GetLengths()[0];
constexpr auto M = a_k_m_global_desc.GetLengths()[1];
constexpr auto N = b_k_n_global_desc.GetLengths()[1];
constexpr index_t MBlockWork = M / MPerBlock;
constexpr index_t NBlockWork = N / NPerBlock;
constexpr index_t KBlockWork = K / KPerBlock;
using ACoord = typename TensorCoordinate<AGlobalDesc>::type;
using BCoord = typename TensorCoordinate<BGlobalDesc>::type;
for(index_t m_block_work_id = 0; m_block_work_id < MBlockWork; ++m_block_work_id)
{
for(index_t n_block_work_id = 0; n_block_work_id < NBlockWork; ++n_block_work_id)
{
// A matrix
{
std::fstream afile;
afile.open("a_mblock_" + std::to_string(m_block_work_id) + "_nblock_" + std::to_string(n_block_work_id) + ".csv", std::fstream::out);
afile << "kblock, offset" << std::endl;
for(index_t k_block_work_id = 0; k_block_work_id < KBlockWork; ++k_block_work_id)
{
for(index_t k = k_block_work_id * KPerBlock ; k < (k_block_work_id + 1) * KPerBlock; ++k)
{
for(index_t m = m_block_work_id * MPerBlock ; m < (m_block_work_id + 1) * MPerBlock; ++m)
{
auto a_coord = ACoord({k, m});
if(a_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
afile << k_block_work_id * 100 << "," << a_coord.GetOffset() << std::endl;
}
}
}
}
afile.close();
}
// B matrix
{
std::fstream bfile;
bfile.open("b_mblock_" + std::to_string(m_block_work_id) + "_nblock_" + std::to_string(n_block_work_id) + ".csv", std::fstream::out);
bfile << "kblock, offset" << std::endl;
for(index_t k_block_work_id = 0; k_block_work_id < KBlockWork; ++k_block_work_id)
{
for(index_t k = k_block_work_id * KPerBlock ; k < (k_block_work_id + 1) * KPerBlock; ++k)
{
for(index_t n = n_block_work_id * NPerBlock ; n < (n_block_work_id + 1) * NPerBlock; ++n)
{
auto b_coord = BCoord({k, n});
if(b_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
bfile << k_block_work_id * 100<< "," << b_coord.GetOffset() << std::endl;
}
}
}
}
bfile.close();
}
}
}
}
}; };
} // namespace ck } // namespace ck
......
...@@ -118,7 +118,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, ...@@ -118,7 +118,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1; constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1; constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 0 #elif 1
// cdata = 64, BlockSize = 256, 128x128x8 // cdata = 64, BlockSize = 256, 128x128x8
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
...@@ -1002,7 +1002,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, ...@@ -1002,7 +1002,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
printf("Start running %d times...\n", nrepeat); printf("Start running %d times...\n", nrepeat);
cudaDeviceSynchronize(); hipDeviceSynchronize();
auto start = std::chrono::steady_clock::now(); auto start = std::chrono::steady_clock::now();
for(index_t i = 0; i < nrepeat; ++i) for(index_t i = 0; i < nrepeat; ++i)
...@@ -1018,7 +1018,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, ...@@ -1018,7 +1018,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
} }
cudaDeviceSynchronize(); hipDeviceSynchronize();
auto end = std::chrono::steady_clock::now(); auto end = std::chrono::steady_clock::now();
float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat; float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;
...@@ -1029,4 +1029,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, ...@@ -1029,4 +1029,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
(std::size_t(1000) * 1000 * 1000) / ave_time); (std::size_t(1000) * 1000 * 1000) / ave_time);
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
gridwise_conv.Run(
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
} }
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" //#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
int main(int argc, char* argv[]) int main(int argc, char* argv[])
...@@ -130,7 +130,7 @@ int main(int argc, char* argv[]) ...@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 3>; using LeftPads = Sequence<0, 3>;
using RightPads = Sequence<0, 3>; using RightPads = Sequence<0, 3>;
#elif 1 #elif 0
// 3x3, 299x299 stride=2 // 3x3, 299x299 stride=2
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 3; constexpr index_t C = 3;
...@@ -267,7 +267,7 @@ int main(int argc, char* argv[]) ...@@ -267,7 +267,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<1, 0>; using LeftPads = Sequence<1, 0>;
using RightPads = Sequence<1, 0>; using RightPads = Sequence<1, 0>;
#elif 1 #elif 0
// 3x3, 147x147 // 3x3, 147x147
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 64; constexpr index_t C = 64;
...@@ -298,7 +298,7 @@ int main(int argc, char* argv[]) ...@@ -298,7 +298,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<3, 0>; using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>;
#elif 1 #elif 0
// 3x3, 73x73 // 3x3, 73x73
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 64; constexpr index_t C = 64;
...@@ -331,10 +331,10 @@ int main(int argc, char* argv[]) ...@@ -331,10 +331,10 @@ int main(int argc, char* argv[])
#elif 0 #elif 0
// 1x1, 14x14 // 1x1, 14x14
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 1024; constexpr index_t C = 128;
constexpr index_t HI = 14; constexpr index_t HI = 14;
constexpr index_t WI = 14; constexpr index_t WI = 14;
constexpr index_t K = 256; constexpr index_t K = 128;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 1; constexpr index_t X = 1;
...@@ -373,13 +373,13 @@ int main(int argc, char* argv[]) ...@@ -373,13 +373,13 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<1, 1>; using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>; using RightPads = Sequence<1, 1>;
#elif 0 #elif 1
// 3x3, 14x14 // 3x3, 14x14
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 256; constexpr index_t C = 128;
constexpr index_t HI = 14; constexpr index_t HI = 14;
constexpr index_t WI = 14; constexpr index_t WI = 14;
constexpr index_t K = 256; constexpr index_t K = 128;
constexpr index_t Y = 3; constexpr index_t Y = 3;
constexpr index_t X = 3; constexpr index_t X = 3;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment