Commit f35c64eb authored by Chao Liu's avatar Chao Liu
Browse files

experimenting

parent 22114959
...@@ -160,7 +160,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -160,7 +160,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr unsigned WeiBlockCopyDataPerRead = 4; constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 1 #elif 0
// 1x1, 28x28, 256 thread // 1x1, 28x28, 256 thread
constexpr unsigned BPerBlock = 128; constexpr unsigned BPerBlock = 128;
constexpr unsigned KPerBlock = 128; constexpr unsigned KPerBlock = 128;
...@@ -211,7 +211,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, ...@@ -211,7 +211,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
for(unsigned i = 0; i < nrepeat; ++i) for(unsigned i = 0; i < nrepeat; ++i)
{ {
float time = launch_kernel( float time = launch_kernel(
#if 0 #if 1
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
#else #else
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer
......
...@@ -661,9 +661,9 @@ int main(int argc, char* argv[]) ...@@ -661,9 +661,9 @@ int main(int argc, char* argv[])
device_direct_convolution_2_nchw_kcyx_nkhw device_direct_convolution_2_nchw_kcyx_nkhw
#elif 0 #elif 0
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
#elif 1
device_implicit_gemm_convolution_1_chwn_cyxk_khwn
#elif 0 #elif 0
device_implicit_gemm_convolution_1_chwn_cyxk_khwn
#elif 1
device_implicit_gemm_convolution_2_chwn_cyxk_khwn device_implicit_gemm_convolution_2_chwn_cyxk_khwn
#endif #endif
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
......
This diff is collapsed.
This diff is collapsed.
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "blockwise_2d_tensor_op.hip.hpp" #include "blockwise_2d_tensor_op.hip.hpp"
#include "threadwise_nd_tensor_op.hip.hpp" #include "threadwise_nd_tensor_op.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp" #include "threadwise_4d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp" #include "blockwise_batched_gemm.hip.hpp"
template <unsigned GridSize, template <unsigned GridSize,
unsigned BlockSize, unsigned BlockSize,
...@@ -211,8 +211,6 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric ...@@ -211,8 +211,6 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
{ {
#if 0 #if 0
blockwise_batch_gemm.Run blockwise_batch_gemm.Run
#elif 0
blockwise_batch_gemm.Run_v2
#elif 1 #elif 1
blockwise_batch_gemm.Run_v3 blockwise_batch_gemm.Run_v3
#endif #endif
......
...@@ -236,15 +236,19 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric ...@@ -236,15 +236,19 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
for(unsigned x = 0; x < X; ++x) for(unsigned x = 0; x < X; ++x)
{ {
auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 1 #if 0
blockwise_gemm.Run blockwise_gemm.Run
#else #elif 1
blockwise_gemm.Run_asm
#elif 0
blockwise_gemm.Run_v2
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
#endif #endif
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block + y * Wi + x, p_in_block + y * Wi + x,
p_out_thread, p_out_thread,
f_accum); f_accum);
} }
} }
} }
......
...@@ -34,7 +34,11 @@ template <unsigned GridSize, ...@@ -34,7 +34,11 @@ template <unsigned GridSize,
unsigned WeiBlockCopyThreadPerDim1, unsigned WeiBlockCopyThreadPerDim1,
unsigned InBlockCopyDataPerRead, unsigned InBlockCopyDataPerRead,
unsigned WeiBlockCopyDataPerRead> unsigned WeiBlockCopyDataPerRead>
__global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer( __global__ void
#if 0
__launch_bounds__(256,2)
#endif
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer(
const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global, const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global) Float* const __restrict__ p_out_global)
...@@ -280,15 +284,15 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b ...@@ -280,15 +284,15 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
for(unsigned x = 0; x < X; ++x) for(unsigned x = 0; x < X; ++x)
{ {
auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 1 #if 0
blockwise_gemm.Run blockwise_gemm.Run
#else #else
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
#endif #endif
(p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block_now + y * Wi + x, p_in_block_now + y * Wi + x,
p_out_thread, p_out_thread,
f_accum); f_accum);
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment