"...composable_kernel.git" did not exist on "f6934e0bf4460c7ad97c57d5f4a645e426048b1d"
Commit 11b848da authored by Jing Zhang's avatar Jing Zhang
Browse files

fixed stride_division_ceil

parent 06810ad4
...@@ -49,7 +49,8 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc, ...@@ -49,7 +49,8 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
constexpr index_t N1 = 2; constexpr index_t N1 = 2;
constexpr index_t N2 = 4; constexpr index_t N2 = 4;
constexpr index_t B = (N * (Ho/Strides::Get(I0)) * (Wo/Strides::Get(I1))) / (N1 * N2); constexpr index_t B = N * mod_conv::integer_divide_ceil(Ho, Strides::Get(I0)) *
mod_conv::integer_divide_ceil(Wo, Strides::Get(I1)) / (N1 * N2);
#if 1 #if 1
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
......
...@@ -25,8 +25,6 @@ struct GeneratorTensor_0 ...@@ -25,8 +25,6 @@ struct GeneratorTensor_0
} }
}; };
struct GeneratorTensor_1 struct GeneratorTensor_1
{ {
template <class... Is> template <class... Is>
...@@ -122,12 +120,12 @@ template <class TIn, ...@@ -122,12 +120,12 @@ template <class TIn,
class Strides, class Strides,
class Dilations> class Dilations>
void host_direct_convolution_forw(const Tensor<TIn>& in_nchw, void host_direct_convolution_forw(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx, const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw, Tensor<TOut>& out_nkhw,
LowerPads, LowerPads,
UpperPads, UpperPads,
Strides, Strides,
Dilations) Dilations)
{ {
index_t h_pad_low = LowerPads{}.Get(Number<0>{}); index_t h_pad_low = LowerPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{}); index_t w_pad_low = LowerPads{}.Get(Number<1>{});
...@@ -179,13 +177,12 @@ template <class TIn, ...@@ -179,13 +177,12 @@ template <class TIn,
class Strides, class Strides,
class Dilations> class Dilations>
void host_direct_convolution_back(Tensor<TOut>& in_nchw, void host_direct_convolution_back(Tensor<TOut>& in_nchw,
const Tensor<TWei>& wei_kcyx, const Tensor<TWei>& wei_kcyx,
const Tensor<TIn>& out_nkhw, const Tensor<TIn>& out_nkhw,
LowerPads, LowerPads,
UpperPads, UpperPads,
Strides, Strides,
Dilations Dilations)
)
{ {
index_t h_pad_low = LowerPads{}.Get(Number<0>{}); index_t h_pad_low = LowerPads{}.Get(Number<0>{});
index_t w_pad_low = LowerPads{}.Get(Number<1>{}); index_t w_pad_low = LowerPads{}.Get(Number<1>{});
...@@ -199,22 +196,23 @@ void host_direct_convolution_back(Tensor<TOut>& in_nchw, ...@@ -199,22 +196,23 @@ void host_direct_convolution_back(Tensor<TOut>& in_nchw,
index_t dilation_h = Dilations{}.Get(Number<0>{}); index_t dilation_h = Dilations{}.Get(Number<0>{});
index_t dilation_w = Dilations{}.Get(Number<1>{}); index_t dilation_w = Dilations{}.Get(Number<1>{});
//loop n,c,hi,wi // loop n,c,hi,wi
auto f = [&](auto n, auto c, auto hi, auto wi) { auto f = [&](auto n, auto c, auto hi, auto wi) {
double v = 0; double v = 0;
//loop k,y,x // loop k,y,x
for(int k = 0; k < wei_kcyx.mDesc.GetLengths()[0]; ++k) for(int k = 0; k < wei_kcyx.mDesc.GetLengths()[0]; ++k)
{ {
for(int y = 0; y < wei_kcyx.mDesc.GetLengths()[2]; ++y) for(int y = 0; y < wei_kcyx.mDesc.GetLengths()[2]; ++y)
{ {
int ho_ = (hi - y * dilation_h + h_pad_low); int ho_ = (hi - y * dilation_h + h_pad_low);
int ho = ho_ / stride_h; int ho = ho_ / stride_h;
for(int x = 0; x < wei_kcyx.mDesc.GetLengths()[3]; ++x) for(int x = 0; x < wei_kcyx.mDesc.GetLengths()[3]; ++x)
{ {
int wo_ = (wi - x * dilation_w + w_pad_low); int wo_ = (wi - x * dilation_w + w_pad_low);
int wo = wo_ / stride_w; int wo = wo_ / stride_w;
if(ho >= 0 && ho < out_nkhw.mDesc.GetLengths()[2] && wo >= 0 && if(ho >= 0 && ho < out_nkhw.mDesc.GetLengths()[2] && wo >= 0 &&
wo < out_nkhw.mDesc.GetLengths()[3] && ho_ % stride_h == 0 && wo_ % stride_w == 0) wo < out_nkhw.mDesc.GetLengths()[3] && ho_ % stride_h == 0 &&
wo_ % stride_w == 0)
{ {
v += double(out_nkhw(n, k, ho, wo)) * double(wei_kcyx(k, c, y, x)); v += double(out_nkhw(n, k, ho, wo)) * double(wei_kcyx(k, c, y, x));
} }
...@@ -501,7 +499,7 @@ int main(int argc, char* argv[]) ...@@ -501,7 +499,7 @@ int main(int argc, char* argv[])
constexpr index_t HDilation = 1; constexpr index_t HDilation = 1;
constexpr index_t WDilation = 1; constexpr index_t WDilation = 1;
constexpr index_t Direction = 2; //1: Forward; 2:Backward constexpr index_t Direction = 1; // 1: Forward; 2:Backward
#if 0 #if 0
constexpr index_t N = 32; constexpr index_t N = 32;
constexpr index_t C = 128; constexpr index_t C = 128;
...@@ -553,8 +551,8 @@ int main(int argc, char* argv[]) ...@@ -553,8 +551,8 @@ int main(int argc, char* argv[])
// 1x1 filter, 28x28 image // 1x1 filter, 28x28 image
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 128; constexpr index_t C = 128;
constexpr index_t HI = 28; constexpr index_t HI = 7;
constexpr index_t WI = 28; constexpr index_t WI = 7;
constexpr index_t K = 128; constexpr index_t K = 128;
constexpr index_t Y = 1; constexpr index_t Y = 1;
constexpr index_t X = 1; constexpr index_t X = 1;
...@@ -716,8 +714,8 @@ int main(int argc, char* argv[]) ...@@ -716,8 +714,8 @@ int main(int argc, char* argv[])
#elif 0 #elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_0{}, num_thread); in_nchw.GenerateTensorValue(GeneratorTensor_0{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
//out_nkhw.GenerateTensorValue(GeneratorTensor_Checkboard{}, num_thread); // out_nkhw.GenerateTensorValue(GeneratorTensor_Checkboard{}, num_thread);
//out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); // out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
out_nkhw.GenerateTensorValue(GeneratorTensor_4{}, num_thread); out_nkhw.GenerateTensorValue(GeneratorTensor_4{}, num_thread);
#elif 0 #elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread); in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
...@@ -764,8 +762,7 @@ int main(int argc, char* argv[]) ...@@ -764,8 +762,7 @@ int main(int argc, char* argv[])
strides, strides,
dilations, dilations,
in_nchw_device, in_nchw_device,
nrepeat nrepeat);
);
#elif 1 #elif 1
device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc, device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc,
......
...@@ -112,27 +112,28 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw ...@@ -112,27 +112,28 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
// input tensor // input tensor
// tensor descriptor in device memory [N0, N1, N2, Ho, Wo] // tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, constexpr auto in_n0_n1_n2_h_w_global_desc =
Number<Ho/Strides::Get(I0)>{}) in_n_c_h_w_global_desc
.Slice(I3, Number<Wo/Strides::Get(I1)>{}) .Slice(I2, Number<mod_conv::integer_divide_ceil(Ho, Strides::Get(I0))>{})
.Fold(I0, Number<N1>{}, Number<N2>{}) .Slice(I3, Number<mod_conv::integer_divide_ceil(Wo, Strides::Get(I1))>{})
.Extract(Sequence<0, 1, 2, 4, 5>{}); .Fold(I0, Number<N1>{}, Number<N2>{})
.Extract(Sequence<0, 1, 2, 4, 5>{});
//constexpr auto in_n0_n1_n2_h_w_global_desc =
//in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{}) // constexpr auto in_n0_n1_n2_h_w_global_desc =
//.Extract(Sequence<0, 1, 2, 4, 5>{}); // in_n_c_h_w_global_desc.Fold(I0, Number<N1>{}, Number<N2>{})
//.Extract(Sequence<0, 1, 2, 4, 5>{});
//constexpr auto in_lengths_new = Sequence<N0, N1, N2, Ho, Wo>{};
// constexpr auto in_lengths_new = Sequence<N0, N1, N2, Ho, Wo>{};
//constexpr auto in_strides_new =
//Sequence<in_n0_n1_n2_h_w_global_desc.GetStride(I0), // constexpr auto in_strides_new =
//in_n0_n1_n2_h_w_global_desc.GetStride(I1), // Sequence<in_n0_n1_n2_h_w_global_desc.GetStride(I0),
//in_n0_n1_n2_h_w_global_desc.GetStride(I2), // in_n0_n1_n2_h_w_global_desc.GetStride(I1),
//in_n0_n1_n2_h_w_global_desc.GetStride(I3), // in_n0_n1_n2_h_w_global_desc.GetStride(I2),
//in_n0_n1_n2_h_w_global_desc.GetStride(I4)>{}; // in_n0_n1_n2_h_w_global_desc.GetStride(I3),
// in_n0_n1_n2_h_w_global_desc.GetStride(I4)>{};
//constexpr auto in_n0_n1_n2_h_w_new_global_desc =
//make_ConstantTensorDescriptor(in_lengths_new, in_strides_new); // constexpr auto in_n0_n1_n2_h_w_new_global_desc =
// make_ConstantTensorDescriptor(in_lengths_new, in_strides_new);
constexpr auto in_n0_n1_n2_h_w_new_global_desc = in_n0_n1_n2_h_w_global_desc; constexpr auto in_n0_n1_n2_h_w_new_global_desc = in_n0_n1_n2_h_w_global_desc;
// batch descritpor for device memory // batch descritpor for device memory
...@@ -140,17 +141,17 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw ...@@ -140,17 +141,17 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
constexpr auto in_c_y_x_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Y>{}) constexpr auto in_c_y_x_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Y>{})
.Slice(I3, Number<X>{}) .Slice(I3, Number<X>{})
.Extract(Sequence<1, 2, 3>{}); .Extract(Sequence<1, 2, 3>{});
//constexpr auto in_win_lengths_new = Sequence<in_c_y_x_global_desc.GetLength(I0), // constexpr auto in_win_lengths_new = Sequence<in_c_y_x_global_desc.GetLength(I0),
//in_c_y_x_global_desc.GetLength(I1), // in_c_y_x_global_desc.GetLength(I1),
//in_c_y_x_global_desc.GetLength(I2)>{}; // in_c_y_x_global_desc.GetLength(I2)>{};
//constexpr auto in_win_strides_new = // constexpr auto in_win_strides_new =
//Sequence<in_c_y_x_global_desc.GetStride(I0), // Sequence<in_c_y_x_global_desc.GetStride(I0),
//in_c_y_x_global_desc.GetStride(I1), // in_c_y_x_global_desc.GetStride(I1),
//in_c_y_x_global_desc.GetStride(I2)>{}; // in_c_y_x_global_desc.GetStride(I2)>{};
//constexpr auto in_c_y_x_new_global_desc = // constexpr auto in_c_y_x_new_global_desc =
//make_ConstantTensorDescriptor(in_win_lengths_new, in_win_strides_new); // make_ConstantTensorDescriptor(in_win_lengths_new, in_win_strides_new);
// merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor( constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
...@@ -189,16 +190,14 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw ...@@ -189,16 +190,14 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
InBlockCopyDstDataPerWrite_N2>( InBlockCopyDstDataPerWrite_N2>(
{0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
// weight tensor // weight tensor
// tensor descriptor in device memory, src of blockwise copy // tensor descriptor in device memory, src of blockwise copy
#if 0 #if 0
constexpr auto wei_e_k_global_desc = constexpr auto wei_e_k_global_desc =
wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
#else #else
constexpr auto wei_e_k_global_desc = constexpr auto wei_e_k_global_desc = make_ConstantMergedTensorDescriptor(
make_ConstantMergedTensorDescriptor(wei_k_c_y_x_global_desc, wei_k_c_y_x_global_desc, Sequence<1, 2, 3>{}, Sequence<0>{});
Sequence<1, 2, 3>{},
Sequence<0>{});
#endif #endif
// tensor descriptor in LDS, dst of blockwise copy // tensor descriptor in LDS, dst of blockwise copy
...@@ -426,9 +425,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw ...@@ -426,9 +425,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I3), out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I3),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I4), out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I4),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I5), out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I5),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I6) / Strides{}.Get(I0), mod_conv::integer_divide_ceil(
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I7) / Strides{}.Get(I1) out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I6), Strides{}.Get(I0)),
>{}; mod_conv::integer_divide_ceil(
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetLength(I7), Strides{}.Get(I1))>{};
constexpr auto out_strides_new = Sequence< constexpr auto out_strides_new = Sequence<
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I0), out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I0),
...@@ -438,12 +438,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw ...@@ -438,12 +438,10 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I4), out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I4),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I5), out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I5),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I6) * Strides{}.Get(I0), out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I6) * Strides{}.Get(I0),
out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I7) * Strides{}.Get(I1) out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.GetStride(I7) * Strides{}.Get(I1)>{};
>{};
constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_new_global_mem_desc = make_ConstantTensorDescriptor( constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_new_global_mem_desc =
out_lengths_new, out_strides_new make_ConstantTensorDescriptor(out_lengths_new, out_strides_new);
);
// calculate origin of thread output tensor on global memory // calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index // blockwise GEMM c matrix starting index
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment