Commit 9e0d6146 authored by Chao Liu

add heat_map

parent e69b1970
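
This commit adds a "heat map" dump for studying global-memory access patterns: the gridwise convolution's Run becomes __host__ __device__, GridwiseGemmTransposedANormalBNormalC_v1 gains a host-only Run that writes the global-memory offset of every element each GEMM block tile reads into per-tile CSV files, and the driver switches to a 3x3, 14x14, C = K = 128 problem so the dump stays manageable.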
@@ -45,7 +45,7 @@ template <index_t GridSize,
           index_t GemmCThreadCopyDstDataPerWrite_GemmN1>
 struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
 {
-    __device__ void Run(const Float* const __restrict__ p_in_global,
+    __host__ __device__ void Run(const Float* const __restrict__ p_in_global,
                         const Float* const __restrict__ p_wei_global,
                         Float* const __restrict__ p_out_global) const
     {
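For context: marking Run as __host__ __device__ compiles the same body for both CPU and GPU, which is what allows the driver further down to call it on the host and produce the CSV dump. A minimal standalone sketch of the pattern, assuming a HIP toolchain (the function below is hypothetical, not from this repo):

#include <hip/hip_runtime.h>
#include <cstdio>

// compiled for both host and device: the same index arithmetic can run
// inside a kernel or be traced on the CPU without a launch
__host__ __device__ int row_major_offset(int row, int col, int stride)
{
    return row * stride + col;
}

int main()
{
    std::printf("%d\n", row_major_offset(2, 3, 8)); // prints 19
}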
@@ -9,6 +9,8 @@
 #include "threadwise_generic_tensor_slice_copy.hpp"
 #include "blockwise_gemm.hpp"
 
+#include <fstream>
+
 namespace ck {
 
 template <index_t GridSize,
@@ -385,6 +387,88 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
         Run(p_a_global, p_b_global, p_c_global, p_shared_block);
     }
 
+    // host-side walk of the same GEMM blocking: for every (m_block, n_block)
+    // tile, record the global-memory offset of each A and B element the tile
+    // reads, one CSV per tile (the pointers are unused; only the layout
+    // descriptors matter here)
+    __host__ void Run(const Float* __restrict__ p_a_global,
+                      const Float* __restrict__ p_b_global,
+                      Float* __restrict__ p_c_global) const
+    {
+        constexpr auto a_k_m_global_desc = AGlobalDesc{};
+        constexpr auto b_k_n_global_desc = BGlobalDesc{};
+        constexpr auto c_m_n_global_desc = CGlobalDesc{};
+
+        constexpr auto K = a_k_m_global_desc.GetLengths()[0];
+        constexpr auto M = a_k_m_global_desc.GetLengths()[1];
+        constexpr auto N = b_k_n_global_desc.GetLengths()[1];
+
+        constexpr index_t MBlockWork = M / MPerBlock;
+        constexpr index_t NBlockWork = N / NPerBlock;
+        constexpr index_t KBlockWork = K / KPerBlock;
+
+        using ACoord = typename TensorCoordinate<AGlobalDesc>::type;
+        using BCoord = typename TensorCoordinate<BGlobalDesc>::type;
+
+        for(index_t m_block_work_id = 0; m_block_work_id < MBlockWork; ++m_block_work_id)
+        {
+            for(index_t n_block_work_id = 0; n_block_work_id < NBlockWork; ++n_block_work_id)
+            {
+                // A matrix
+                {
+                    std::fstream afile;
+                    afile.open("a_mblock_" + std::to_string(m_block_work_id) + "_nblock_" + std::to_string(n_block_work_id) + ".csv", std::fstream::out);
+                    afile << "kblock, offset" << std::endl;
+
+                    for(index_t k_block_work_id = 0; k_block_work_id < KBlockWork; ++k_block_work_id)
+                    {
+                        for(index_t k = k_block_work_id * KPerBlock; k < (k_block_work_id + 1) * KPerBlock; ++k)
+                        {
+                            for(index_t m = m_block_work_id * MPerBlock; m < (m_block_work_id + 1) * MPerBlock; ++m)
+                            {
+                                auto a_coord = ACoord({k, m});
+
+                                if(a_coord.IsOffsetValidAssumingUpperIndexIsValid())
+                                {
+                                    // scale the K-block id so each K-block lands
+                                    // in a distinct value band of the heat map
+                                    afile << k_block_work_id * 100 << "," << a_coord.GetOffset() << std::endl;
+                                }
+                            }
+                        }
+                    }
+
+                    afile.close();
+                }
+
+                // B matrix
+                {
+                    std::fstream bfile;
+                    bfile.open("b_mblock_" + std::to_string(m_block_work_id) + "_nblock_" + std::to_string(n_block_work_id) + ".csv", std::fstream::out);
+                    bfile << "kblock, offset" << std::endl;
+
+                    for(index_t k_block_work_id = 0; k_block_work_id < KBlockWork; ++k_block_work_id)
+                    {
+                        for(index_t k = k_block_work_id * KPerBlock; k < (k_block_work_id + 1) * KPerBlock; ++k)
+                        {
+                            for(index_t n = n_block_work_id * NPerBlock; n < (n_block_work_id + 1) * NPerBlock; ++n)
+                            {
+                                auto b_coord = BCoord({k, n});
+
+                                if(b_coord.IsOffsetValidAssumingUpperIndexIsValid())
+                                {
+                                    bfile << k_block_work_id * 100 << "," << b_coord.GetOffset() << std::endl;
+                                }
+                            }
+                        }
+                    }
+
+                    bfile.close();
+                }
+            }
+        }
+    }
 };
 
 } // namespace ck
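The host-side Run above writes one "kblock, offset" CSV per (m_block, n_block) tile, with the kblock column scaled by 100 so each K-block occupies a distinct value band when rendered. A sketch of a post-processing step (hypothetical tool, not part of the commit; the bucket width is an assumption): fold one CSV into per-bucket access counts, the raw series an offset heat map is drawn from.

#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main()
{
    // file name follows the commit's naming scheme
    std::ifstream csv("a_mblock_0_nblock_0.csv");
    constexpr long bucket = 256; // assumed bucket width; tune to the tensor size

    std::string line;
    std::getline(csv, line); // skip the "kblock, offset" header

    std::map<long, long> hits; // offset bucket -> access count
    while(std::getline(csv, line))
    {
        std::istringstream row(line);
        std::string kblock, offset;
        std::getline(row, kblock, ',');
        std::getline(row, offset, ',');
        ++hits[std::stol(offset) / bucket];
    }

    // emit "bucket_start,count" rows, ready for any plotting tool
    for(const auto& [b, n] : hits)
        std::cout << b * bucket << "," << n << "\n";
}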
@@ -118,7 +118,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN   = 1;
     constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
-#elif 0
+#elif 1
     // cdata = 64, BlockSize = 256, 128x128x8
     constexpr index_t BlockSize = 256;
@@ -1002,7 +1002,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     printf("Start running %d times...\n", nrepeat);
 
-    cudaDeviceSynchronize();
+    hipDeviceSynchronize();
 
     auto start = std::chrono::steady_clock::now();
 
     for(index_t i = 0; i < nrepeat; ++i)
@@ -1018,7 +1018,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     }
 
-    cudaDeviceSynchronize();
+    hipDeviceSynchronize();
 
    auto end = std::chrono::steady_clock::now();
 
    float ave_time = std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;
@@ -1029,4 +1029,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
            (std::size_t(1000) * 1000 * 1000) / ave_time);
 
     out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+
+    // one extra host-side call after timing, so the heat-map CSVs get written
+    gridwise_conv.Run(static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
+                      static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
+                      static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
 }
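The timing above follows the usual synchronize / launch nrepeat times / synchronize pattern, then divides the elapsed wall-clock time by nrepeat. The same pattern as a reusable helper, sketched under the assumption of a HIP toolchain (average_ms is a hypothetical name, not from this repo):

#include <hip/hip_runtime.h>
#include <chrono>

// times nrepeat invocations of launch() and returns the average milliseconds;
// the surrounding synchronizations keep asynchronous kernel launches honest
template <typename Launch>
float average_ms(Launch&& launch, int nrepeat)
{
    hipDeviceSynchronize(); // drain prior GPU work

    auto start = std::chrono::steady_clock::now();

    for(int i = 0; i < nrepeat; ++i)
        launch(); // e.g. a lambda wrapping the kernel launch

    hipDeviceSynchronize(); // wait for the last launch to finish

    auto end = std::chrono::steady_clock::now();

    return std::chrono::duration<float, std::milli>(end - start).count() / nrepeat;
}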
@@ -18,7 +18,7 @@
 //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 
 int main(int argc, char* argv[])
@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 3>;
     using RightPads = Sequence<0, 3>;
-#elif 1
+#elif 0
     // 3x3, 299x299 stride=2
     constexpr index_t N  = 128;
     constexpr index_t C  = 3;
@@ -267,7 +267,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<1, 0>;
     using RightPads = Sequence<1, 0>;
-#elif 1
+#elif 0
     // 3x3, 147x147
     constexpr index_t N  = 128;
     constexpr index_t C  = 64;
@@ -298,7 +298,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
-#elif 1
+#elif 0
     // 3x3, 73x73
     constexpr index_t N  = 128;
     constexpr index_t C  = 64;
@@ -331,10 +331,10 @@ int main(int argc, char* argv[])
 #elif 0
     // 1x1, 14x14
     constexpr index_t N  = 128;
-    constexpr index_t C  = 1024;
+    constexpr index_t C  = 128;
     constexpr index_t HI = 14;
     constexpr index_t WI = 14;
-    constexpr index_t K  = 256;
+    constexpr index_t K  = 128;
     constexpr index_t Y  = 1;
     constexpr index_t X  = 1;
@@ -373,13 +373,13 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<1, 1>;
     using RightPads = Sequence<1, 1>;
-#elif 0
+#elif 1
     // 3x3, 14x14
     constexpr index_t N  = 128;
-    constexpr index_t C  = 256;
+    constexpr index_t C  = 128;
     constexpr index_t HI = 14;
     constexpr index_t WI = 14;
-    constexpr index_t K  = 256;
+    constexpr index_t K  = 128;
     constexpr index_t Y  = 3;
     constexpr index_t X  = 3;