Commit 67c6f73f authored by Chao Liu's avatar Chao Liu
Browse files

hip build

parent 121693b3
#pragma once #pragma once
#include <memory> #include <memory>
#include "config.h"
struct DeviceMem struct DeviceMem
{ {
...@@ -27,4 +28,31 @@ struct KernelTimer ...@@ -27,4 +28,31 @@ struct KernelTimer
std::unique_ptr<KernelTimerImpl> impl; std::unique_ptr<KernelTimerImpl> impl;
}; };
// Declaration only; the definition is not part of this excerpt.
void launch_kernel(const void* func, dim3 grid_dim, dim3 block_dim, void** args, float& time);

// Launches `kernel` on a `grid_dim` x `block_dim` configuration with 0 bytes of
// dynamic shared memory on stream 0, bracketing the launch call with a
// KernelTimer.
//
// Parameters:
//   kernel    - the kernel to launch (passed straight to the backend launch call).
//   grid_dim  - grid dimensions.
//   block_dim - block dimensions.
//   args...   - kernel arguments, taken by value. On the CUDA path they are
//               handed over as raw addresses, so each deduced type should match
//               the corresponding kernel parameter type exactly
//               (NOTE(review): nothing here enforces that - confirm at call sites).
//
// Returns: whatever KernelTimer::GetElapsedTime() reports after End().
// If neither DEVICE_BACKEND_HIP nor DEVICE_BACKEND_CUDA is enabled, nothing is
// launched and the timer is queried without having been started.
template <typename... Args, typename F>
float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args)
{
    KernelTimer timer;

#if DEVICE_BACKEND_HIP
    timer.Start();

    hipLaunchKernelGGL(kernel, grid_dim, block_dim, 0, 0, args...);

    timer.End();

    // NOTE(review): the error string is computed and then discarded, so a failed
    // launch goes unreported on this path. Left as-is because no HIP
    // error-reporting helper is visible from this excerpt.
    hipGetErrorString(hipGetLastError());
#elif DEVICE_BACKEND_CUDA
    const void* f = reinterpret_cast<const void*>(kernel);

    // cudaLaunchKernel expects an array holding one pointer per kernel argument.
    // The extra slot keeps the array well-formed when the pack is empty; it is
    // value-initialized to nullptr.
    void* p_args[sizeof...(Args) + 1] = {static_cast<void*>(&args)...};

    timer.Start();

    cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, 0, 0);

    timer.End();

    checkCudaErrors(error);
#endif

    return timer.GetElapsedTime();
}
#pragma once #pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh" #include "ConstantTensorDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh" #include "blockwise_4d_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh" #include "blockwise_direct_convolution.cuh"
...@@ -146,7 +147,8 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ ...@@ -146,7 +147,8 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
c_block_work_begin += CPerBlock) c_block_work_begin += CPerBlock)
{ {
// copy input tensor to LDS // copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin, c_block_work_begin,
hi_block_work_begin, hi_block_work_begin,
wi_block_work_begin), wi_block_work_begin),
...@@ -176,9 +178,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ ...@@ -176,9 +178,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
} }
// copy output tensor from LDS to device mem // copy output tensor from LDS to device mem
blockwise_out_copy.Run(p_out_block, blockwise_out_copy.Run(
p_out_global + out_global_desc.Get1dIndex(n_block_work_begin, p_out_block,
k_block_work_begin, p_out_global +
ho_block_work_begin, out_global_desc.Get1dIndex(
wo_block_work_begin)); n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
} }
#pragma once #pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh" #include "ConstantTensorDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh" #include "blockwise_4d_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh" #include "blockwise_direct_convolution.cuh"
...@@ -162,7 +163,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -162,7 +163,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
c_block_data_begin += CPerBlock, __syncthreads()) c_block_data_begin += CPerBlock, __syncthreads())
{ {
// copy input tensor to LDS // copy input tensor to LDS
blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin, c_block_data_begin,
hi_block_data_begin, hi_block_data_begin,
wi_block_data_begin), wi_block_data_begin),
...@@ -177,11 +179,12 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -177,11 +179,12 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
{ {
// threadwise convolution // threadwise convolution
#if 1 #if 1
threadwise_direct_convolution_2( threadwise_direct_convolution_2(
in_thread_block_desc, in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data, c_thread_data,
hi_thread_data_begin, hi_thread_data_begin,
wi_thread_data_begin), wi_thread_data_begin),
...@@ -192,7 +195,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -192,7 +195,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
#elif 0 #elif 0
threadwise_direct_convolution_3( threadwise_direct_convolution_3(
in_thread_block_desc, in_thread_block_desc,
p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data, c_thread_data,
hi_thread_data_begin, hi_thread_data_begin,
wi_thread_data_begin), wi_thread_data_begin),
...@@ -209,7 +213,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_ ...@@ -209,7 +213,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
out_thread_desc, out_thread_desc,
p_out_thread, p_out_thread,
out_global_desc, out_global_desc,
p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, p_out_global +
out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin, k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin), wo_block_data_begin + wo_thread_data_begin),
......
...@@ -121,8 +121,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric ...@@ -121,8 +121,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
decltype(in_chwn_block_desc), decltype(in_chwn_block_desc),
decltype(in_chwn_block_desc.GetLengths())>{}; decltype(in_chwn_block_desc.GetLengths())>{};
// blockwise wei copy // blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock] // format is [CPerBlock*S*R,KPerBlock]
#if 0 #if 0
const auto blockwise_wei_copy = const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize, Blockwise2dTensorCopy1<BlockSize,
...@@ -199,7 +199,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric ...@@ -199,7 +199,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread); threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
const Float* p_in_global_block_begin = const Float* p_in_global_block_begin =
p_in_global + in_chwn_global_desc.Get1dIndex( p_in_global +
in_chwn_global_desc.Get1dIndex(
0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
const Float* p_wei_global_block_begin = const Float* p_wei_global_block_begin =
...@@ -257,7 +258,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric ...@@ -257,7 +258,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
out_hkwn_thread_desc, out_hkwn_thread_desc,
p_out_thread, p_out_thread,
out_khwn_global_desc, out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, p_out_global +
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin, wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin), n_block_data_begin + n_thread_data_begin),
......
...@@ -283,7 +283,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded( ...@@ -283,7 +283,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
out_hkwn_thread_desc, out_hkwn_thread_desc,
p_out_thread, p_out_thread,
out_khwn_global_desc, out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, p_out_global +
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin, wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin), n_block_data_begin + n_thread_data_begin),
......
...@@ -256,7 +256,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p ...@@ -256,7 +256,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0; Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0;
Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0; Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0;
// preload next data // preload next data
#if 1 #if 1
// input: global mem to LDS, // input: global mem to LDS,
blockwise_in_copy.Run(p_in_global, blockwise_in_copy.Run(p_in_global,
...@@ -339,7 +339,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p ...@@ -339,7 +339,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
out_hkwn_thread_desc, out_hkwn_thread_desc,
p_out_thread, p_out_thread,
out_khwn_global_desc, out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, p_out_global +
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin, wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin), n_block_data_begin + n_thread_data_begin),
......
...@@ -160,7 +160,8 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric ...@@ -160,7 +160,8 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N] // convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>( blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
in_nchw_global_desc, in_nchw_global_desc,
p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, p_in_global +
in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin, c_block_data_begin,
hi_block_data_begin, hi_block_data_begin,
wi_block_data_begin), wi_block_data_begin),
...@@ -244,7 +245,8 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric ...@@ -244,7 +245,8 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_hkwn_thread_desc, out_hkwn_thread_desc,
p_out_thread, p_out_thread,
out_nkhw_global_desc, out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin, p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin, k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin), wo_block_data_begin + wo_thread_data_begin),
...@@ -261,7 +263,8 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric ...@@ -261,7 +263,8 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_nkhw_thread_desc, out_nkhw_thread_desc,
p_out_thread, p_out_thread,
out_nkhw_global_desc, out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin, p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin,
k_block_data_begin + k_thread_data_begin, k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin), wo_block_data_begin + wo_thread_data_begin),
......
...@@ -166,7 +166,8 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric ...@@ -166,7 +166,8 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N] // convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>( blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
in_nchw_global_desc, in_nchw_global_desc,
p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, p_in_global +
in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin, c_block_data_begin,
hi_block_data_begin, hi_block_data_begin,
wi_block_data_begin), wi_block_data_begin),
...@@ -179,8 +180,9 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric ...@@ -179,8 +180,9 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
#if 1 #if 1
// weight: global mem to LDS, // weight: global mem to LDS,
// format is [S,R,C,K], no conversion needed // format is [S,R,C,K], no conversion needed
blockwise_wei_copy.Run(p_wei_global + wei_srck_global_desc.Get1dIndex( blockwise_wei_copy.Run(
0, 0, c_block_data_begin, k_block_data_begin), p_wei_global +
wei_srck_global_desc.Get1dIndex(0, 0, c_block_data_begin, k_block_data_begin),
p_wei_block); p_wei_block);
#endif #endif
...@@ -217,7 +219,8 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric ...@@ -217,7 +219,8 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
out_hkwn_thread_desc, out_hkwn_thread_desc,
p_out_thread, p_out_thread,
out_nkhw_global_desc, out_nkhw_global_desc,
p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin, k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin, ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin), wo_block_data_begin + wo_thread_data_begin),
......
...@@ -111,8 +111,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric ...@@ -111,8 +111,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
} }
#endif #endif
// blockwise in copy // blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead] // format is [CPerBlock,BPerBlock + BGhostRead]
#if 0 #if 0
const auto blockwise_in_copy = const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize, Blockwise2dTensorCopy1<BlockSize,
...@@ -137,8 +137,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric ...@@ -137,8 +137,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
InBlockCopyDataPerRead>{}; InBlockCopyDataPerRead>{};
#endif #endif
// blockwise wei copy // blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock] // format is [CPerBlock*S*R,KPerBlock]
#if 0 #if 0
const auto blockwise_wei_copy = const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize, Blockwise2dTensorCopy1<BlockSize,
......
...@@ -111,8 +111,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b ...@@ -111,8 +111,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
} }
#endif #endif
// blockwise in copy // blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead] // format is [CPerBlock,BPerBlock + BGhostRead]
#if 0 #if 0
const auto blockwise_in_copy = const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize, Blockwise2dTensorCopy1<BlockSize,
...@@ -137,8 +137,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b ...@@ -137,8 +137,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
InBlockCopyDataPerRead>{}; InBlockCopyDataPerRead>{};
#endif #endif
// blockwise wei copy // blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock] // format is [CPerBlock*S*R,KPerBlock]
#if 0 #if 0
const auto blockwise_wei_copy = const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize, Blockwise2dTensorCopy1<BlockSize,
...@@ -258,7 +258,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b ...@@ -258,7 +258,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
__syncthreads(); __syncthreads();
// load next data // load next data
#if 0 #if 0
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next); blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next); blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next);
......
...@@ -103,8 +103,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(const Float* const __restric ...@@ -103,8 +103,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(const Float* const __restric
} }
#endif #endif
// blockwise in copy // blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead] // format is [CPerBlock,BPerBlock + BGhostRead]
#if 0 #if 0
const auto blockwise_in_copy = const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize, Blockwise2dTensorCopy1<BlockSize,
......
...@@ -103,8 +103,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline ...@@ -103,8 +103,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
} }
#endif #endif
// in: global mem to LDS // in: global mem to LDS
// format is [CPerBlock,BPerBlock + BGhostRead] // format is [CPerBlock,BPerBlock + BGhostRead]
#if 1 #if 1
const auto blockwise_in_copy = const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize, Blockwise2dTensorCopy1<BlockSize,
...@@ -129,8 +129,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline ...@@ -129,8 +129,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
decltype(in_cb_block_desc.GetLengths())>{}; decltype(in_cb_block_desc.GetLengths())>{};
#endif #endif
// weight: global mem to LDS, // weight: global mem to LDS,
// format is [S,R,CPerBlock,KPerBlock] // format is [S,R,CPerBlock,KPerBlock]
#if 1 #if 1
const auto blockwise_wei_copy = const auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize, Blockwise4dTensorCopy1<BlockSize,
...@@ -191,7 +191,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline ...@@ -191,7 +191,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
// set threadwise output tensor to 0 // set threadwise output tensor to 0
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
// prelog: load data // prelog: load data
#if 1 #if 1
// input: global mem to LDS, // input: global mem to LDS,
blockwise_in_copy.Run(p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin), blockwise_in_copy.Run(p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin),
...@@ -220,8 +220,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline ...@@ -220,8 +220,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
#if 1 #if 1
// preload next data // preload next data
// input: global mem to LDS, // input: global mem to LDS,
blockwise_in_copy.Run(p_in_global + in_cb_global_desc.Get1dIndex( blockwise_in_copy.Run(
c_block_data_begin + CPerBlock, b_block_data_begin), p_in_global +
in_cb_global_desc.Get1dIndex(c_block_data_begin + CPerBlock, b_block_data_begin),
p_in_block_next); p_in_block_next);
#endif #endif
......
...@@ -189,15 +189,16 @@ __global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_ ...@@ -189,15 +189,16 @@ __global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_
S, S,
R, R,
OutTileSizeH, OutTileSizeH,
OutTileSizeW>(in_transform_thread_block_desc, OutTileSizeW>(
p_in_transform_block + in_transform_block_desc.Get1dIndex( in_transform_thread_block_desc,
n_thread_data_begin, p_in_transform_block +
in_transform_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data, c_thread_data,
y_thread_data_begin * InTileSizeH, y_thread_data_begin * InTileSizeH,
x_thread_data_begin * InTileSizeW), x_thread_data_begin * InTileSizeW),
wei_transform_thread_block_desc, wei_transform_thread_block_desc,
p_wei_transform_block + wei_transform_block_desc.Get1dIndex( p_wei_transform_block +
k_thread_data_begin, c_thread_data, 0, 0), wei_transform_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_transform_thread_desc, out_transform_thread_desc,
p_out_transform_thread); p_out_transform_thread);
} }
......
...@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim) ...@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
return os; return os;
} }
typedef enum typedef enum {
{
Half = 0, Half = 0,
Float = 1, Float = 1,
} DataType_t; } DataType_t;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment