Commit 418ca5ee authored by root's avatar root
Browse files

add nc0hwc1

parent 14f22aa6
...@@ -135,6 +135,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -135,6 +135,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
constexpr auto KPerThreadSubC = 4; constexpr auto KPerThreadSubC = 4;
static_assert(KPerThread % KPerThreadSubC == 0, ""); static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % 2 == 0, "");
static_assert(WPerThread % 2 == 0, "");
// thread A, B for GEMM // thread A, B for GEMM
constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2( constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
...@@ -164,16 +166,23 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -164,16 +166,23 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
#pragma unroll #pragma unroll
for(index_t k_begin = 0; k_begin < KPerThread; k_begin += KPerThreadSubC) for(index_t k_begin = 0; k_begin < KPerThread; k_begin += KPerThreadSubC)
{ {
a_thread_copy.Run(p_a_block + a_thread_copy.Run(p_a_block +
a_block_mtx.CalculateOffset(make_tuple(e_begin, k_begin)) + a_block_mtx.CalculateOffset(make_tuple(e_begin, k_begin)) +
mMyThreadOffsetA, mMyThreadOffsetA,
p_a_thread); p_a_thread);
threadwise_gemm.Run( for(index_t h_begin = 0; h_begin < HPerThread; h_begin += 2)
p_a_thread, {
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(e_begin, 0, 0, 0)),
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(k_begin, 0, 0, 0))); for(index_t w_begin = 0; w_begin < WPerThread; w_begin += 2)
{
threadwise_gemm.Run(p_a_thread,
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(
e_begin, 0, h_begin, w_begin)),
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(
k_begin, 0, h_begin, w_begin)));
}
}
} }
} }
} }
......
...@@ -54,8 +54,10 @@ struct ThreadwiseGemm_km_kn_mn_v3 ...@@ -54,8 +54,10 @@ struct ThreadwiseGemm_km_kn_mn_v3
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
constexpr auto H = BDesc{}.GetLength(I2); // constexpr auto H = BDesc{}.GetLength(I2);
constexpr auto W = BDesc{}.GetLength(I3); // constexpr auto W = BDesc{}.GetLength(I3);
constexpr auto H = 2;
constexpr auto W = 2;
constexpr auto E = ADesc{}.GetLength(I0); constexpr auto E = ADesc{}.GetLength(I0);
constexpr auto K = ADesc{}.GetLength(I1); constexpr auto K = ADesc{}.GetLength(I1);
......
...@@ -29,16 +29,33 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -29,16 +29,33 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
{ {
using namespace ck; using namespace ck;
std::cout << "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw" std::cout << "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw"
<< std::endl; << std::endl;
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace()); DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace()); DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace()); DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); constexpr auto I0 = Number<0>{};
wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data()); constexpr auto I1 = Number<1>{};
out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data()); constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto N = OutDesc::GetLengths()[I0];
constexpr auto K = OutDesc::GetLengths()[I1];
constexpr auto C = WeiDesc::GetLengths()[I1];
constexpr auto Hi = InDesc::GetLengths()[I2];
constexpr auto Wi = InDesc::GetLengths()[I3];
constexpr auto Ho = OutDesc::GetLengths()[I2];
constexpr auto Wo = OutDesc::GetLengths()[I3];
constexpr auto Y = WeiDesc::GetLengths()[I2];
constexpr auto X = WeiDesc::GetLengths()[I3];
constexpr auto C0 = C / Number<InWeiVectorSize>{};
constexpr auto C1 = Number<InWeiVectorSize>{};
#if 0 #if 0
// run-time variables // run-time variables
...@@ -55,12 +72,12 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -55,12 +72,12 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
const auto in_right_pads = to_multi_index(InRightPads{}); const auto in_right_pads = to_multi_index(InRightPads{});
#else #else
// compile-time variables // compile-time variables
const auto in_n_c_hi_wi_desc = make_dynamic_naive_tensor_descriptor_packed_v2( const auto in_n_c0_hi_wi_desc =
sequence_to_tuple_of_number(InDesc::GetLengths())); make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C0, Hi, Wi));
const auto wei_k_c_y_x_desc = make_dynamic_naive_tensor_descriptor_packed_v2( const auto wei_k_c0_y_x_desc =
sequence_to_tuple_of_number(WeiDesc::GetLengths())); make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C0, Y, X));
const auto out_n_k_ho_wo_desc = make_dynamic_naive_tensor_descriptor_packed_v2( const auto out_n_k_ho_wo_desc =
sequence_to_tuple_of_number(OutDesc::GetLengths())); make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo));
const auto conv_strides = sequence_to_tuple_of_number(ConvStrides{}); const auto conv_strides = sequence_to_tuple_of_number(ConvStrides{});
const auto conv_dilations = sequence_to_tuple_of_number(ConvDilations{}); const auto conv_dilations = sequence_to_tuple_of_number(ConvDilations{});
...@@ -68,21 +85,43 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -68,21 +85,43 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
const auto in_right_pads = sequence_to_tuple_of_number(InRightPads{}); const auto in_right_pads = sequence_to_tuple_of_number(InRightPads{});
#endif #endif
Tensor<TInWei> in_n_c0_hi_wi_c1(make_HostTensorDescriptor(
make_native_tensor_descriptor_packed(Sequence<N, C0, Hi, Wi, C1>{})));
Tensor<TInWei> wei_k_c0_y_x_c1(make_HostTensorDescriptor(
make_native_tensor_descriptor_packed(Sequence<K, C0, Y, X, C1>{})));
auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
in_n_c_hi_wi(n, c, hi, wi);
};
auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
wei_k_c_y_x(k, c, y, x);
};
make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
in_n_c_hi_wi_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c_y_x_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
// out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
// cdata = 16, BlockSize = 64, 16x64x4 // cdata = 16, BlockSize = 64, 16x64x4
constexpr index_t BlockSize = 64; constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 4; constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 16; constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 16; constexpr index_t WoPerBlock = 32;
constexpr index_t EPerBlock = 1; constexpr index_t EPerBlock = 1;
constexpr index_t KPerThread = 4; constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2; constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2; constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1; constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E_K = Sequence<1, 1>; using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<9, 4>; using ABlockTransferThreadClusterLengths_E_K = Sequence<4, 16>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1; constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1; constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
...@@ -112,8 +151,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -112,8 +151,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
BThreadTransferSrcScalarPerVector_W, BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W>{}; CThreadTransferDstScalarPerVector_W>{};
conv_driver.Run(wei_k_c_y_x_desc, conv_driver.Run(wei_k_c0_y_x_desc,
in_n_c_hi_wi_desc, in_n_c0_hi_wi_desc,
out_n_k_ho_wo_desc, out_n_k_ho_wo_desc,
conv_strides, conv_strides,
conv_dilations, conv_dilations,
......
...@@ -62,11 +62,11 @@ int main(int argc, char* argv[]) ...@@ -62,11 +62,11 @@ int main(int argc, char* argv[])
using ConvStrides = Sequence<1, 1>; using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>; using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>;
#elif 1 #elif 1
constexpr index_t N = 1; constexpr index_t N = 1;
constexpr index_t C = 4; constexpr index_t C = 16;
constexpr index_t HI = 1080; constexpr index_t HI = 1080;
constexpr index_t WI = 1920; constexpr index_t WI = 1920;
constexpr index_t K = 16; constexpr index_t K = 16;
...@@ -630,12 +630,12 @@ int main(int argc, char* argv[]) ...@@ -630,12 +630,12 @@ int main(int argc, char* argv[])
print_array("ConvStrides", to_multi_index(ConvStrides{})); print_array("ConvStrides", to_multi_index(ConvStrides{}));
print_array("ConvDilations", to_multi_index(ConvDilations{})); print_array("ConvDilations", to_multi_index(ConvDilations{}));
#if 1 #if 0
using in_data_t = float; using in_data_t = float;
constexpr index_t in_vector_size = 1; constexpr index_t in_vector_size = 1;
using acc_data_t = float; using acc_data_t = float;
using out_data_t = float; using out_data_t = float;
#elif 1 #elif 0
using in_data_t = float; using in_data_t = float;
constexpr index_t in_vector_size = 1; constexpr index_t in_vector_size = 1;
using acc_data_t = float; using acc_data_t = float;
...@@ -741,7 +741,7 @@ int main(int argc, char* argv[]) ...@@ -741,7 +741,7 @@ int main(int argc, char* argv[])
LeftPads{}, LeftPads{},
RightPads{}, RightPads{},
nrepeat); nrepeat);
#elif 1 #elif 0
device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk<in_data_t, device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk<in_data_t,
in_vector_size, in_vector_size,
acc_data_t, acc_data_t,
......
...@@ -3,7 +3,7 @@ rm -f CMakeCache.txt ...@@ -3,7 +3,7 @@ rm -f CMakeCache.txt
rm -f *.cmake rm -f *.cmake
rm -rf CMakeFiles rm -rf CMakeFiles
MY_PROJECT_SOURCE=../../../ MY_PROJECT_SOURCE=../
MY_PROJECT_INSTALL=../install.dir MY_PROJECT_INSTALL=../install.dir
cmake \ cmake \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment