Commit fd368ca6 authored by Jing Zhang
Browse files

separate c2

parent 7802381d
...@@ -26,6 +26,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 ...@@ -26,6 +26,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
struct MatrixIndex struct MatrixIndex
{ {
index_t k; index_t k;
index_t n;
index_t h; index_t h;
index_t w; index_t w;
}; };
...@@ -104,7 +105,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 ...@@ -104,7 +105,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
index_t h_thread_id = hw_thread_id / num_w_threads; index_t h_thread_id = hw_thread_id / num_w_threads;
index_t w_thread_id = hw_thread_id % num_w_threads; index_t w_thread_id = hw_thread_id % num_w_threads;
return MatrixIndex{k_thread_id, h_thread_id, w_thread_id}; return MatrixIndex{k_thread_id, 1, h_thread_id, w_thread_id};
} }
template <typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer> template <typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer>
......
...@@ -352,8 +352,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -352,8 +352,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
index_t e0_block_data_begin = 0; index_t e0_block_data_begin = 0;
do // do
{ //{
// LDS double buffer: preload data // LDS double buffer: preload data
{ {
a_blockwise_copy.RunRead( a_blockwise_copy.RunRead(
...@@ -391,7 +391,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -391,7 +391,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3
b_e0_e1_n_ho_wo_e2_global_step_hacks); b_e0_e1_n_ho_wo_e2_global_step_hacks);
// LDS double buffer: GEMM on current data // LDS double buffer: GEMM on current data
// TODO: @Zhang Jing: blockwise gemm should be able to move slice window
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0)); blockwise_gemm.MoveABlockSliceWindow(make_tuple(EPerBlock, 0, 0));
...@@ -443,18 +442,18 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -443,18 +442,18 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf);
} }
a_blockwise_copy.MoveSrcSliceWindow(a_e0_e1_k_e2_global_desc, // a_blockwise_copy.MoveSrcSliceWindow(a_e0_e1_k_e2_global_desc,
a_block_slice_copy_step, // a_block_slice_copy_step,
AGlobalMoveSliceWindowStepHacks{}); // AGlobalMoveSliceWindowStepHacks{});
blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - EPerBlock), 0, 0)); // blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - EPerBlock), 0, 0));
b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc, // b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
b_thread_slice_copy_step); // b_thread_slice_copy_step);
e0_block_data_begin += 1; // e0_block_data_begin += 1;
} while(e0_block_data_begin < E0); //} while(e0_block_data_begin < E0);
// output: register to global memory // output: register to global memory
{ {
......
...@@ -64,41 +64,98 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 ...@@ -64,41 +64,98 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
constexpr auto K = AThreadDesc_E1_K_E2{}.GetLength(I1); constexpr auto K = AThreadDesc_E1_K_E2{}.GetLength(I1);
constexpr auto E2 = AThreadDesc_E1_K_E2{}.GetLength(I2); constexpr auto E2 = AThreadDesc_E1_K_E2{}.GetLength(I2);
constexpr auto H = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I2); constexpr auto Ho = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
constexpr auto W = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I3); constexpr auto Wo = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
constexpr index_t Vec = 2;
static_for<0, K, 1>{}([&](auto k) { static_for<0, K, 1>{}([&](auto k) {
static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) {
static_for<0, E1, 1>{}([&](auto e) { static_for<0, E1, 1>{}([&](auto e) {
static_for<0, Ho, Vec>{}([&](auto h) {
static_for<0, Wo, Vec>{}([&](auto w) {
vector_type<FloatA, E2> a_vec; vector_type<FloatA, E2> a_vec;
vector_type<FloatB, E2> b_vec;
vector_type<FloatB, E2> b0_vec;
vector_type<FloatB, E2> b1_vec;
vector_type<FloatB, E2> b2_vec;
vector_type<FloatB, E2> b3_vec;
static_for<0, E2, 1>{}([&](auto v) { static_for<0, E2, 1>{}([&](auto v) {
constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset( constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset(
a_origin_idx + make_tuple(e, k, v)); a_origin_idx + make_tuple(e, k, v));
constexpr index_t b_offset =
constexpr index_t b0_offset =
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_origin_idx + make_tuple(e, 0, h, w, v)); b_origin_idx + make_tuple(e, 0, h, w, v));
constexpr index_t b1_offset =
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_origin_idx + make_tuple(e, 0, h, w + 1, v));
constexpr index_t b2_offset =
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_origin_idx + make_tuple(e, 0, h + 1, w, v));
constexpr index_t b3_offset =
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_origin_idx + make_tuple(e, 0, h + 1, w + 1, v));
a_vec.template AsType<FloatA>()(v) = a_buf[Number<a_offset>{}]; a_vec.template AsType<FloatA>()(v) = a_buf[Number<a_offset>{}];
b_vec.template AsType<FloatB>()(v) = b_buf[Number<b_offset>{}];
b0_vec.template AsType<FloatB>()(v) = b_buf[Number<b0_offset>{}];
b1_vec.template AsType<FloatB>()(v) = b_buf[Number<b1_offset>{}];
b2_vec.template AsType<FloatB>()(v) = b_buf[Number<b2_offset>{}];
b3_vec.template AsType<FloatB>()(v) = b_buf[Number<b3_offset>{}];
}); });
using a_vector_t = typename vector_type<FloatA, E2>::type; using a_vector_t = typename vector_type<FloatA, E2>::type;
using b_vector_t = typename vector_type<FloatB, E2>::type; using b_vector_t = typename vector_type<FloatB, E2>::type;
constexpr index_t c_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset( constexpr index_t c0_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
c_origin_idx + make_tuple(k, 0, h, w)); c_origin_idx + make_tuple(k, 0, h, w));
inner_product<a_vector_t, b_vector_t, FloatC>( constexpr index_t c1_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
a_vec.template AsType<a_vector_t>()[I0], c_origin_idx + make_tuple(k, 0, h, w + 1));
b_vec.template AsType<b_vector_t>()[I0],
c_buf(Number<c_offset>{})); constexpr index_t c2_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
c_origin_idx + make_tuple(k, 0, h + 1, w));
constexpr index_t c3_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
c_origin_idx + make_tuple(k, 0, h + 1, w + 1));
amd_assembly_outer_product_1x4(a_vec.template AsType<a_vector_t>()[I0],
b0_vec.template AsType<b_vector_t>()[I0],
b1_vec.template AsType<b_vector_t>()[I0],
b2_vec.template AsType<b_vector_t>()[I0],
b3_vec.template AsType<b_vector_t>()[I0],
c_buf(Number<c0_offset>{}),
c_buf(Number<c1_offset>{}),
c_buf(Number<c2_offset>{}),
c_buf(Number<c3_offset>{}));
// inner_product<a_vector_t, b_vector_t, FloatC>(
// a_vec.template AsType<a_vector_t>()[I0],
// b0_vec.template AsType<b_vector_t>()[I0],
// c_buf(Number<c0_offset>{}));
// inner_product<a_vector_t, b_vector_t, FloatC>(
// a_vec.template AsType<a_vector_t>()[I0],
// b1_vec.template AsType<b_vector_t>()[I0],
// c_buf(Number<c1_offset>{}));
// inner_product<a_vector_t, b_vector_t, FloatC>(
// a_vec.template AsType<a_vector_t>()[I0],
// b2_vec.template AsType<b_vector_t>()[I0],
// c_buf(Number<c2_offset>{}));
// inner_product<a_vector_t, b_vector_t, FloatC>(
// a_vec.template AsType<a_vector_t>()[I0],
// b3_vec.template AsType<b_vector_t>()[I0],
// c_buf(Number<c3_offset>{}));
}); });
}); });
}); });
......
...@@ -49,7 +49,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( ...@@ -49,7 +49,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
const auto Y = wei_k_c_y_x_lengths[I2]; const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3]; const auto X = wei_k_c_y_x_lengths[I3];
#if 0 #if 1
const auto C0 = C / Number<InWeiVectorSize>{}; const auto C0 = C / Number<InWeiVectorSize>{};
const auto C1 = Number<InWeiVectorSize>{}; const auto C1 = Number<InWeiVectorSize>{};
...@@ -105,17 +105,17 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw( ...@@ -105,17 +105,17 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
constexpr index_t HoPerBlock = 8; constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32; constexpr index_t WoPerBlock = 32;
constexpr index_t E1 = 4 * 9; constexpr index_t E1 = 2 * 9;
constexpr index_t E2 = 4; constexpr index_t E2 = 8;
constexpr index_t EPerBlock = 4; constexpr index_t EPerBlock = 2;
constexpr index_t KPerThread = KPerBlock; constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2; constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2; constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1; constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 9, 1, E2>; using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 9, 1, 8>;
using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, 4, 16, 1>; using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, EPerBlock, 16, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2; constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2; constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
......
...@@ -93,23 +93,29 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -93,23 +93,29 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW std::cerr << "InRightPadH = " << InRightPadH << " InRightPadW = " << InRightPadW
<< std::endl; << std::endl;
const auto E = C0 * Y * X * C1; const auto E = C0 * Y * X;
const auto E0 = E / (E1 * E2);
// static_assert(E % E1 == 0, "");
static_assert(E2 == C1, "");
const auto E0 = E / E1;
// weight tensor // weight tensor
const auto a_e_k_grid_desc = transform_tensor_descriptor( const auto a_e0_k_e2_grid_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X * C1)), make_naive_tensor_descriptor_packed(make_tuple(K, C0 * Y * X, E2)),
make_tuple(make_pass_through_transform(K), make_tuple(make_pass_through_transform(K),
make_pass_through_transform(C0 * Y * X * C1)), make_pass_through_transform(C0 * Y * X),
make_tuple(Sequence<0>{}, Sequence<1>{}), make_pass_through_transform(E2)),
make_tuple(Sequence<1>{}, Sequence<0>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}));
const auto a_e0_e1_k_e2_grid_desc = const auto a_e0_e1_k_e2_grid_desc =
transform_tensor_descriptor(a_e_k_grid_desc, transform_tensor_descriptor(a_e0_k_e2_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(E0, E1, E2)), make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
make_pass_through_transform(K)), make_pass_through_transform(K),
make_tuple(Sequence<0>{}, Sequence<1>{}), make_pass_through_transform(E2)),
make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}));
// input tensor // input tensor
const auto in_n_c0_hip_wip_c1_global_desc = transform_tensor_descriptor( const auto in_n_c0_hip_wip_c1_global_desc = transform_tensor_descriptor(
...@@ -118,7 +124,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -118,7 +124,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_pass_through_transform(C0), make_pass_through_transform(C0),
make_pad_transform(Hi, InLeftPadH, InRightPadH), make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW), make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C1)), make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
...@@ -129,28 +135,32 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -129,28 +135,32 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_pass_through_transform(C0), make_pass_through_transform(C0),
make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)), make_embed_transform(make_tuple(Y, Hop), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)), make_embed_transform(make_tuple(X, Wop), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C1)), make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple( make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{})); Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6>{}));
const auto b_e_n_ho_wo_grid_desc = transform_tensor_descriptor( const auto b_e0_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
in_n_c0_y_ho_x_wo_c1_global_desc, in_n_c0_y_ho_x_wo_c1_global_desc,
make_tuple(make_merge_transform(make_tuple(C0, Y, X, C1)), make_tuple(make_merge_transform(make_tuple(C0, Y, X)),
make_pass_through_transform(N), make_pass_through_transform(N),
make_pass_through_transform(Hop), make_pass_through_transform(Hop),
make_pass_through_transform(Wop)), make_pass_through_transform(Wop),
make_tuple(Sequence<1, 2, 4, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}), make_pass_through_transform(E2)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); make_tuple(
Sequence<1, 2, 4>{}, Sequence<0>{}, Sequence<3>{}, Sequence<5>{}, Sequence<6>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor( const auto b_e0_e1_n_ho_wo_e2_grid_desc = transform_tensor_descriptor(
b_e_n_ho_wo_grid_desc, b_e0_n_ho_wo_e2_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(E0, E1, E2)), make_tuple(make_unmerge_transform(make_tuple(E0, E1)),
make_pass_through_transform(N), make_pass_through_transform(N),
make_pass_through_transform(Hop), make_pass_through_transform(Hop),
make_pass_through_transform(Wop)), make_pass_through_transform(Wop),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_pass_through_transform(E2)),
make_tuple(Sequence<0, 1, 5>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{}));
// output tensor // output tensor
const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor( const auto c_k_n_hop_wop_grid_desc = transform_tensor_descriptor(
...@@ -165,40 +175,41 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp ...@@ -165,40 +175,41 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl; std::cerr << "Hop = " << Hop << " Wop = " << Wop << std::endl;
if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 && if(!((K % KPerBlock) == 0 && (Hop % HoPerBlock) == 0 && (Wop % WoPerBlock) == 0 &&
(E % EPerBlock) == 0)) (E1 % EPerBlock) == 0))
{ {
throw std::runtime_error("wrong! GEMM size no divisible"); throw std::runtime_error("wrong! GEMM size no divisible");
} }
// hack to control index calculation when iterating over a_k_m_global tensor // hack to control index calculation when iterating over a_k_m_global tensor
constexpr auto a_e0_e1_k_e2_global_step_hacks = constexpr auto a_e0_e1_k_e2_global_step_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}), Sequence<0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{}, make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})); Sequence<0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack = Sequence<0, 0, 0, 0, 0>{}; constexpr auto a_e0_e1_k_e2_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0>{};
constexpr auto b_e0_e1_n_ho_wo_e2_global_step_hacks = make_tuple( constexpr auto b_e0_e1_n_ho_wo_e2_global_step_hacks = make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{})); Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}));
constexpr auto b_e0_e1_n_ho_wo_e2_global_move_slice_window_step_hack = constexpr auto b_e0_e1_n_ho_wo_e2_global_move_slice_window_step_hack =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}; Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format // hack for NKHW format
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment