Commit 40694062 authored by Jing Zhang

add adaptors

parent 3e5e4cf7
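
The adaptors added in this commit replace hand-written division/modulo index math with single-stage merge-transform adaptors (make_single_stage_tensor_adaptor followed by CalculateBottomIndex), both for mapping a thread id to its (K, N, Ho, Wo) thread-cluster position and for mapping a block id to its block-cluster position. For reference, a minimal standalone C++ sketch of the row-major decomposition such an adaptor performs; ClusterIdx and decompose_flat_id are hypothetical names used only for illustration and are not part of the library:

#include <cstdint>
#include <cstdio>

using index_t = std::int32_t;

// Hypothetical holder for a (k, n, h, w) cluster index.
struct ClusterIdx
{
    index_t k, n, h, w;
};

// Row-major decomposition of a flat id over (K0, N0, H0, W0): the same
// arithmetic that merging (K0, N0, H0, W0) into one dimension and then
// asking for the bottom index of a flat id produces.
ClusterIdx decompose_flat_id(index_t id, index_t N0, index_t H0, index_t W0)
{
    const index_t w = id % W0;
    id /= W0;
    const index_t h = id % H0;
    id /= H0;
    const index_t n = id % N0;
    id /= N0;
    return {id, n, h, w}; // what remains is the k component
}

int main()
{
    // Example: a 4 x 1 x 2 x 3 cluster (24 flat ids); id 17 maps to (2, 0, 1, 2).
    const ClusterIdx c = decompose_flat_id(17, 1, 2, 3);
    std::printf("k=%d n=%d h=%d w=%d\n", c.k, c.n, c.h, c.w);
    return 0;
}
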
@@ -13,7 +13,8 @@ template <index_t BlockSize,
           typename ABlockDesc_E1_K_E2,
           typename BBlockDesc_E1_N_Ho_Wo_E2,
           typename CThreadDesc_K_N_Ho_Wo,
-          index_t EPerThreadLoop>
+          index_t EPerThreadLoop,
+          index_t KPerThreadLoop>
 struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
 {
     static constexpr auto I0 = Number<0>{};
@@ -22,27 +23,21 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
     static constexpr auto I3 = Number<3>{};
     static constexpr auto I4 = Number<4>{};

-    struct MatrixIndex
-    {
-        index_t k;
-        index_t n;
-        index_t h;
-        index_t w;
-    };
+    using AIndex = MultiIndex<3>;
+    using BIndex = MultiIndex<3>;
+    using CIndex = MultiIndex<4>;

     static constexpr auto E1 = ABlockDesc_E1_K_E2{}.GetLength(I0);
-    static constexpr auto K  = ABlockDesc_E1_K_E2{}.GetLength(I1);
+    static constexpr auto KPerBlock = ABlockDesc_E1_K_E2{}.GetLength(I1);
     static constexpr auto E2 = ABlockDesc_E1_K_E2{}.GetLength(I2);

-    static constexpr auto H = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
-    static constexpr auto W = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
+    static constexpr auto HPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
+    static constexpr auto WPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);

     static constexpr auto KPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I0);
     static constexpr auto HPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I2);
     static constexpr auto WPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I3);

-    static constexpr index_t KPerThreadLoop = 2;
-
     static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed(
         make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadLoop>{}, Number<E2>{}));
@@ -57,8 +52,8 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
         Number<KPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));

     __device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3()
-        : c_thread_begin_mtx_idx_{GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id())},
-          a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread, 0)}
+        : c_thread_origin_data_idx_{GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id())},
+          a_thread_copy_{make_tuple(0, c_thread_origin_data_idx_[I0] * KPerThread, 0)}
     {
         static_assert(ABlockDesc_E1_K_E2::IsKnownAtCompileTime() &&
                           BBlockDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
@@ -73,12 +68,13 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
         static_assert(E1 % EPerThreadLoop == 0, "");
         static_assert(KPerThread % KPerThreadLoop == 0, "");

-        static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0,
+        static_assert(KPerBlock % KPerThread == 0 && HPerBlock % HPerThread == 0 &&
+                          WPerBlock % WPerThread == 0,
                       "wrong! Cannot evenly divide work among\n");

-        constexpr auto KThreadCluster = K / KPerThread;
-        constexpr auto HThreadCluster = H / HPerThread;
-        constexpr auto WThreadCluster = W / WPerThread;
+        constexpr auto KThreadCluster = KPerBlock / KPerThread;
+        constexpr auto HThreadCluster = HPerBlock / HPerThread;
+        constexpr auto WThreadCluster = WPerBlock / WPerThread;

         static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
                       "wrong! wrong blocksize\n");
@@ -86,25 +82,27 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
     __device__ static constexpr auto GetCThreadDesc_K_N_Ho_WoLengths()
     {
-        return Sequence<KPerThread, 1, HPerThread, WPerThread>{};
+        return Sequence<KPerThread, I1, HPerThread, WPerThread>{};
     }

-    __device__ static MatrixIndex GetBeginOfCThreadDesc_K_N_Ho_Wo(index_t thread_id)
+    __device__ static CIndex GetBeginOfCThreadDesc_K_N_Ho_Wo(index_t thread_id)
     {
-        constexpr index_t HPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
-        constexpr index_t WPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
-
-        constexpr auto num_w_threads  = WPerBlock / WPerThread;
-        constexpr auto num_h_threads  = HPerBlock / HPerThread;
-        constexpr auto num_hw_threads = num_w_threads * num_h_threads;
-
-        index_t k_thread_id  = thread_id / num_hw_threads;
-        index_t hw_thread_id = thread_id % num_hw_threads;
-
-        index_t h_thread_id = hw_thread_id / num_w_threads;
-        index_t w_thread_id = hw_thread_id % num_w_threads;
-
-        return MatrixIndex{k_thread_id, 1, h_thread_id, w_thread_id};
+        constexpr auto K0 = KPerBlock / KPerThread;
+        constexpr auto N0 = I1;
+        constexpr auto H0 = HPerBlock / HPerThread;
+        constexpr auto W0 = WPerBlock / WPerThread;
+
+        constexpr auto c_threadid_to_k_n_h_w_thread_cluster_adaptor =
+            make_single_stage_tensor_adaptor(
+                make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))),
+                make_tuple(Sequence<0, 1, 2, 3>{}),
+                make_tuple(Sequence<0>{}));
+
+        const auto c_k_n_h_w_thread_cluster_idx =
+            c_threadid_to_k_n_h_w_thread_cluster_adaptor.CalculateBottomIndex(
+                make_multi_index(thread_id));
+
+        return c_k_n_h_w_thread_cluster_idx;
     }

     template <typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer>
@@ -157,8 +155,6 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
     }

     private:
-    MatrixIndex c_thread_begin_mtx_idx_;
-
     using AThreadCopy =
         ThreadwiseTensorSliceTransfer_v4<FloatA,
                                          FloatA,
@@ -170,6 +166,8 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
                                          E2,
                                          E2>;

+    CIndex c_thread_origin_data_idx_;
+
     AThreadCopy a_thread_copy_;
 };
......
@@ -19,9 +19,7 @@ template <typename GridwiseGemm,
           typename BGridDesc_E0_E1_N_Ho_Wo_E2,
           typename CGridDesc_K_N_Ho_Wo,
           typename CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo,
-          bool HasMainE0BlockLoop,
-          bool HasMainE1BlockLoop,
-          bool HasDoubleTailE1BlockLoop>
+          bool HasMainE0BlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
@@ -47,9 +45,8 @@ __global__ void
                       a_e0_e1_k_e2_grid_desc,
                       b_e0_e1_n_ho_wo_e2_grid_desc,
                       c_k_n_ho_wo_grid_desc,
-                      integral_constant<bool, HasMainE0BlockLoop>{},
-                      integral_constant<bool, HasMainE1BlockLoop>{},
-                      integral_constant<bool, HasDoubleTailE1BlockLoop>{});
+                      c_blockid_to_k_n_ho_wo_block_cluster_adaptor,
+                      integral_constant<bool, HasMainE0BlockLoop>{});
 }
 #elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
 // pass tensor descriptor by CONSTANT void pointer
@@ -62,9 +59,7 @@ template <typename GridwiseGemm,
           typename BGridDesc_E0_E1_N_Ho_Wo_E2,
           typename CGridDesc_K_N_Ho_Wo,
           typename CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo,
-          bool HasMainE0BlockLoop,
-          bool HasMainE1BlockLoop,
-          bool HasDoubleTailE1BlockLoop>
+          bool HasMainE0BlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
@@ -86,6 +81,9 @@ __global__ void
         cast_pointer_to_generic_address_space(p_b_e0_e1_n_ho_wo_e2_grid_desc));
     const auto c_k_n_ho_wo_grid_desc = *reinterpret_cast<const CGridDesc_K_N_Ho_Wo*>(
         cast_pointer_to_generic_address_space(p_c_k_n_ho_wo_grid_desc));
+    const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor =
+        *reinterpret_cast<const CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo*>(
+            cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_ho_wo_block_cluster_adaptor));

     constexpr index_t shared_block_size =
         GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
@@ -99,9 +97,8 @@ __global__ void
                       a_e0_e1_k_e2_grid_desc,
                       b_e0_e1_n_ho_wo_e2_grid_desc,
                       c_k_n_ho_wo_grid_desc,
-                      integral_constant<bool, HasMainE0BlockLoop>{},
-                      integral_constant<bool, HasMainE1BlockLoop>{},
-                      integral_constant<bool, HasDoubleTailE1BlockLoop>{});
+                      c_blockid_to_k_n_ho_wo_block_cluster_adaptor,
+                      integral_constant<bool, HasMainE0BlockLoop>{});
 }
 #endif
@@ -110,9 +107,9 @@ template <index_t BlockSize,
           typename FloatAcc,
           typename FloatC,
           InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
-          typename AGlobalDesc_E0_E1_K_E2,
-          typename BGlobalDesc_E0_E1_N_Ho_Wo_E2,
-          typename CGlobalDesc_K_N_Ho_Wo,
+          typename AGridDesc_E0_E1_K_E2,
+          typename BGridDesc_E0_E1_N_Ho_Wo_E2,
+          typename CGridDesc_K_N_Ho_Wo,
           index_t E1_,
           index_t E2_,
           index_t KPerBlock,
@@ -155,6 +152,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
     static constexpr auto E1 = Number<E1_>{};
     static constexpr auto E2 = Number<E2_>{};

+    static constexpr auto NPerBlock = I1;
+    static constexpr auto K2 = 2;

     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
@@ -172,17 +171,143 @@ struct GridwiseGemmDlops_km_kn_mn_v3
         return a_block_space_size * sizeof(FloatAB);
     }

-    template <bool HasMainE0BlockLoop, bool HasMainE1BlockLoop, bool HasDoubleTailE1BlockLoop>
-    __device__ static void Run(const FloatAB* __restrict__ p_a_global,
+    __host__ __device__ static constexpr index_t
+    CalculateGridSize(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc)
+    {
+        const auto K  = c_k_n_ho_wo_grid_desc.GetLength(I0);
+        const auto N  = c_k_n_ho_wo_grid_desc.GetLength(I1);
+        const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2);
+        const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3);
+
+        const auto K0   = K / KPerBlock;
+        const auto N0   = N / NPerBlock;
+        const auto Ho_0 = Ho / HoPerBlock;
+        const auto Wo_0 = Wo / WoPerBlock;
+
+        const index_t grid_size = K0 * N0 * Ho_0 * Wo_0;
+
+        return grid_size;
+    }
+
+    __host__ __device__ static constexpr bool CalculateHasMainE0BlockLoop(const index_t E0)
+    {
+        const bool has_main_e0_block_loop = E0 > 1;
+
+        return has_main_e0_block_loop;
+    }
+
+    __host__ __device__ static constexpr bool CalculateHasMainE1BlockLoop()
+    {
+        const bool has_main_e1_block_loop = (E1 + E1PerBlock) / (2 * E1PerBlock) > 1;
+
+        return has_main_e1_block_loop;
+    }
+
+    __host__ __device__ static constexpr bool CalculateHasDoubleTailE1BlockLoop()
+    {
+        const bool has_double_tail_e1_block_loop = (E1 / E1PerBlock) % 2 == 0;
+
+        return has_double_tail_e1_block_loop;
+    }
+
+    __host__ __device__ static constexpr auto
+    MakeAE0E1K0K1E2GridDescriptor(const AGridDesc_E0_E1_K_E2& a_e0_e1_k_e2_grid_desc)
+    {
+        const auto E0 = a_e0_e1_k_e2_grid_desc.GetLength(I0);
+        // const auto E1 = a_e0_e1_k_e2_grid_desc.GetLength(I1);
+        const auto K = a_e0_e1_k_e2_grid_desc.GetLength(I2);
+        // const auto E2 = a_e0_e1_k_e2_grid_desc.GetLength(I3);
+
+        const auto K1 = Number<KPerBlock>{};
+        const auto K0 = K / K1;
+
+        const auto a_e0_e1_k0_k1_e2_grid_desc = transform_tensor_descriptor(
+            a_e0_e1_k_e2_grid_desc,
+            make_tuple(make_pass_through_transform(E0),
+                       make_pass_through_transform(E1),
+                       make_unmerge_transform(make_tuple(K0, K1)),
+                       make_pass_through_transform(E2)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}));
+
+        return a_e0_e1_k0_k1_e2_grid_desc;
+    }
+
+    __host__ __device__ static constexpr auto MakeBE0E1NH0H1W0W1E2GridDescriptor(
+        const BGridDesc_E0_E1_N_Ho_Wo_E2& b_e0_e1_n_ho_wo_e2_grid_desc)
+    {
+        const auto E0 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I0);
+        // const auto E1 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I1);
+        const auto N  = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I2);
+        const auto Ho = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I3);
+        const auto Wo = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I4);
+        // const auto E2 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I5);
+
+        const auto H1 = Number<HoPerBlock>{};
+        const auto H0 = Ho / H1;
+
+        const auto W1 = Number<WoPerBlock>{};
+        const auto W0 = Wo / W1;
+
+        const auto b_e0_e1_n_h0_h1_w0_w1_e2_grid_desc =
+            transform_tensor_descriptor(b_e0_e1_n_ho_wo_e2_grid_desc,
+                                        make_tuple(make_pass_through_transform(E0),
+                                                   make_pass_through_transform(E1),
+                                                   make_pass_through_transform(N),
+                                                   make_unmerge_transform(make_tuple(H0, H1)),
+                                                   make_unmerge_transform(make_tuple(W0, W1)),
+                                                   make_pass_through_transform(E2)),
+                                        make_tuple(Sequence<0>{},
+                                                   Sequence<1>{},
+                                                   Sequence<2>{},
+                                                   Sequence<3>{},
+                                                   Sequence<4>{},
+                                                   Sequence<5>{}),
+                                        make_tuple(Sequence<0>{},
+                                                   Sequence<1>{},
+                                                   Sequence<2>{},
+                                                   Sequence<3, 4>{},
+                                                   Sequence<5, 6>{},
+                                                   Sequence<7>{}));
+
+        return b_e0_e1_n_h0_h1_w0_w1_e2_grid_desc;
+    }
+
+    __host__ __device__ static constexpr auto
+    MakeCBlockIdToKNHoWoBlockClusterAdaptor(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc)
+    {
+        const auto K  = c_k_n_ho_wo_grid_desc.GetLength(I0);
+        const auto N  = c_k_n_ho_wo_grid_desc.GetLength(I1);
+        const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2);
+        const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3);
+
+        const auto K0   = K / KPerBlock;
+        const auto N0   = N / NPerBlock;
+        const auto Ho_0 = Ho / HoPerBlock;
+        const auto Wo_0 = Wo / WoPerBlock;
+
+        const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor = make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(K0, N0, Ho_0, Wo_0))),
+            make_tuple(Sequence<0, 1, 2, 3>{}),
+            make_tuple(Sequence<0>{}));
+
+        return c_blockid_to_k_n_ho_wo_block_cluster_adaptor;
+    }
+
+    using CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo =
+        decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{}));
+
+    template <bool HasMainE0BlockLoop>
+    __device__ static void
+    Run(const FloatAB* __restrict__ p_a_global,
         const FloatAB* __restrict__ p_b_global,
         FloatC* __restrict__ p_c_global,
         FloatAB* __restrict__ p_shared_block,
-        const AGlobalDesc_E0_E1_K_E2& a_e0_e1_k_e2_global_desc,
-        const BGlobalDesc_E0_E1_N_Ho_Wo_E2& b_e0_e1_n_ho_wo_e2_global_desc,
-        const CGlobalDesc_K_N_Ho_Wo& c_k_n_ho_wo_global_desc,
-        integral_constant<bool, HasMainE0BlockLoop>,
-        integral_constant<bool, HasMainE1BlockLoop>,
-        integral_constant<bool, HasDoubleTailE1BlockLoop>)
+        const AGridDesc_E0_E1_K_E2& a_e0_e1_k_e2_global_desc,
+        const BGridDesc_E0_E1_N_Ho_Wo_E2& b_e0_e1_n_ho_wo_e2_global_desc,
+        const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_global_desc,
+        const CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo& c_blockid_to_k_n_ho_wo_block_cluster_adaptor,
+        integral_constant<bool, HasMainE0BlockLoop>)
     {
         const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
             p_a_global, a_e0_e1_k_e2_global_desc.GetElementSpaceSize());
@@ -191,40 +316,24 @@ struct GridwiseGemmDlops_km_kn_mn_v3
         auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
             p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());

-        static_assert(E1 % E1PerBlock == 0, "");
-
-        // const auto E = a_e0_e1_k_e2_global_desc.GetLength(I0);
-        // const auto K = a_e0_e1_k_e2_global_desc.GetLength(I1);
-        // const auto N = b_e0_e1_n_ho_wo_e2_global_desc.GetLength(I1);
-        const auto Ho = b_e0_e1_n_ho_wo_e2_global_desc.GetLength(I3);
-        const auto Wo = b_e0_e1_n_ho_wo_e2_global_desc.GetLength(I4);
-
-        // divide block work by [M, N]
-#if 0
-        const auto ho_block_work_num  = Ho / Number<HoPerBlock>{};
-        const auto wo_block_work_num  = Wo / Number<WoPerBlock>{};
-        const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
-
-        const index_t k_block_work_id   = get_block_1d_id() / hwo_block_work_num;
-        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
-        const index_t ho_block_work_id  = hwo_block_work_id / wo_block_work_num;
-        const index_t wo_block_work_id  = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
-#else
-        // Hack: this force result into SGPR
-        const index_t ho_block_work_num  = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock);
-        const index_t wo_block_work_num  = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock);
-        const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num;
-
-        const index_t k_block_work_id =
-            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hwo_block_work_num);
-        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
-
-        const index_t ho_block_work_id =
-            __builtin_amdgcn_readfirstlane(hwo_block_work_id / wo_block_work_num);
-        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
-#endif
+        constexpr auto HasMainE1BlockLoop       = CalculateHasMainE1BlockLoop();
+        constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();
+
+        // const auto Ho = b_e0_e1_n_ho_wo_e2_global_desc.GetLength(I3);
+        // const auto Wo = b_e0_e1_n_ho_wo_e2_global_desc.GetLength(I4);
+
+        const auto c_k_n_ho_wo_block_cluster_idx =
+            c_blockid_to_k_n_ho_wo_block_cluster_adaptor.CalculateBottomIndex(
+                make_multi_index(get_block_1d_id()));
+
+        const index_t k_block_work_id =
+            __builtin_amdgcn_readfirstlane(c_k_n_ho_wo_block_cluster_idx[I0]);
+        const index_t n_block_work_id =
+            __builtin_amdgcn_readfirstlane(c_k_n_ho_wo_block_cluster_idx[I1]);
+        const index_t ho_block_work_id =
+            __builtin_amdgcn_readfirstlane(c_k_n_ho_wo_block_cluster_idx[I2]);
+        const index_t wo_block_work_id =
+            __builtin_amdgcn_readfirstlane(c_k_n_ho_wo_block_cluster_idx[I3]);

         // lds max alignment
         constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
@@ -259,16 +368,18 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                              decltype(a_e1_k_e2_block_desc),
                                              decltype(b_e1_n_ho_wo_e2_block_desc),
                                              decltype(c_k_n_ho_wo_thread_desc),
-                                             EPerThread>{};
+                                             EPerThread,
+                                             K2>{};

         auto c_thread_mtx_index =
             blockwise_gemm.GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id());

-        const auto k_thread_id  = c_thread_mtx_index.k;
-        const auto ho_thread_id = c_thread_mtx_index.h;
-        const auto wo_thread_id = c_thread_mtx_index.w;
+        const auto k_thread_id  = c_thread_mtx_index[I0];
+        const auto ho_thread_id = c_thread_mtx_index[I2];
+        const auto wo_thread_id = c_thread_mtx_index[I3];

         const index_t k_block_data_on_global  = k_block_work_id * KPerBlock;
+        const index_t n_block_data_on_global  = n_block_work_id * HoPerBlock;
         const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
         const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
@@ -320,7 +431,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                                 FloatAB,
                                                 decltype(b_e0_e1_n_ho_wo_e2_global_desc),
                                                 decltype(b_e0_e1_n_ho_wo_e2_thread_desc),
-                                                Sequence<I1, E1PerBlock, 1, HoPerThread, WoPerThread, E2>,
+                                                Sequence<I1, E1PerBlock, NPerBlock, HoPerThread, WoPerThread, E2>,
                                                 BBlockTransferSrcAccessOrder,
                                                 BBlockTransferSrcVectorDim,
                                                 BBlockTransferSrcScalarPerVector,
@@ -346,7 +457,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
         // initialize output thread tensor
         ThreadwiseTensorSliceSet_v1<FloatAcc,
                                     decltype(c_k_n_ho_wo_thread_desc),
-                                    Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
+                                    Sequence<KPerThread, NPerBlock, HoPerThread, WoPerThread>>{}
             .Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});

         constexpr auto b_thread_slice_copy_step = make_multi_index(0, E1PerBlock, 0, 0, 0, 0);
......
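
The E1 loop predicates are now computed inside the gridwise GEMM (CalculateHasMainE1BlockLoop and CalculateHasDoubleTailE1BlockLoop in the hunk above) rather than passed in as kernel template parameters. A small host-side sketch evaluating the same expressions, with E1 = 18 and E1PerBlock = 2 chosen purely for illustration (the real values are template parameters of the gridwise GEMM):

#include <cstdio>

int main()
{
    // Illustrative values only.
    constexpr int E1         = 18;
    constexpr int E1PerBlock = 2;

    // Same expressions as CalculateHasMainE1BlockLoop / CalculateHasDoubleTailE1BlockLoop.
    constexpr bool has_main_e1_block_loop        = (E1 + E1PerBlock) / (2 * E1PerBlock) > 1; // (18 + 2) / 4 = 5 > 1 -> true
    constexpr bool has_double_tail_e1_block_loop = (E1 / E1PerBlock) % 2 == 0;               // 9 % 2 == 1 -> false

    std::printf("has_main_e1_block_loop = %d\n", has_main_e1_block_loop);
    std::printf("has_double_tail_e1_block_loop = %d\n", has_double_tail_e1_block_loop);
    return 0;
}
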
@@ -71,8 +71,9 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
         constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
         constexpr auto c_origin_idx = to_multi_index(COriginIdx{});

-#if 1
-        constexpr index_t SubHW = 2;
+        if constexpr((Ho % 2 == 0) && (Wo % 2 == 0))
+        {
+            constexpr auto SubHW = 2;

             static_for<0, K, 1>{}([&](auto k) {
                 static_for<0, Ho, SubHW>{}([&](auto h) {
@@ -98,16 +99,20 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
                                 BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
                                     b_origin_idx + make_tuple(e1, 0, h + 1, w + 1, e2));

-                            constexpr index_t c0_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
-                                c_origin_idx + make_tuple(k, 0, h, w));
+                            constexpr index_t c0_offset =
+                                CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx +
+                                                                        make_tuple(k, 0, h, w));

-                            constexpr index_t c1_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
-                                c_origin_idx + make_tuple(k, 0, h, w + 1));
+                            constexpr index_t c1_offset =
+                                CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
+                                    c_origin_idx + make_tuple(k, 0, h, w + 1));

-                            constexpr index_t c2_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
-                                c_origin_idx + make_tuple(k, 0, h + 1, w));
+                            constexpr index_t c2_offset =
+                                CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
+                                    c_origin_idx + make_tuple(k, 0, h + 1, w));

-                            constexpr index_t c3_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
-                                c_origin_idx + make_tuple(k, 0, h + 1, w + 1));
+                            constexpr index_t c3_offset =
+                                CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
+                                    c_origin_idx + make_tuple(k, 0, h + 1, w + 1));

                             amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
@@ -124,7 +129,10 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
                     });
                 });
             });
-#else
+        }
+        else
+        {
             static_for<0, K, 1>{}([&](auto k) {
                 static_for<0, Ho, 1>{}([&](auto h) {
                     static_for<0, Wo, 1>{}([&](auto w) {
@@ -137,8 +145,9 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
                                 BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
                                     b_origin_idx + make_tuple(e1, 0, h, w, e2));

-                            constexpr index_t c_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
-                                c_origin_idx + make_tuple(k, 0, h, w));
+                            constexpr index_t c_offset =
+                                CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx +
+                                                                        make_tuple(k, 0, h, w));

                             inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
                                                                   b_buf[Number<b_offset>{}],
@@ -148,7 +157,7 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
                         });
                     });
                 });
-#endif
+        }
     }
 };
......
@@ -122,9 +122,9 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
     constexpr index_t HoPerBlock = 8;
     constexpr index_t WoPerBlock = 32;

-    constexpr index_t E1        = C0 * 9;
+    constexpr index_t E1        = 2 * 9;
     constexpr index_t E2        = 1;
-    constexpr index_t EPerBlock = C0;
+    constexpr index_t EPerBlock = 2;

     constexpr index_t KPerThread  = 16;
     constexpr index_t HoPerThread = 2;
......
@@ -77,8 +77,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
     const auto ConvDilationH = conv_dilations[I0];
     const auto ConvDilationW = conv_dilations[I1];

-    const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
-    const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
+    // const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
+    // const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
+
+    const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
+    const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;

     const auto OutRightPadH = Hop - Ho;
     const auto OutRightPadW = Wop - Wo;
@@ -226,9 +229,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
                    Sequence<0, 0, 0, 0, 0>{},
                    Sequence<0, 0, 0, 0, 0>{}));

-    static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
-    static_assert(b_e0_e1_n_ho_wo_e2_grid_desc.IsKnownAtCompileTime(), "");
-    static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), "");
+    // static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
+    // static_assert(b_e0_e1_n_ho_wo_e2_grid_desc.IsKnownAtCompileTime(), "");
+    // static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), "");

     // GEMM
     using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
@@ -280,18 +283,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
     const auto grid_size = (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;

     const bool has_main_e0_block_loop = E0 > 1;
-    const bool has_main_e1_block_loop = (E1 + E1PerBlock) / (2 * E1PerBlock) > 1;
-    const bool has_double_tail_e1_block_loop = (E1 / E1PerBlock) % 2 == 0;

-    std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop
-              << "has_main_e1_block_loop = " << has_main_e1_block_loop
-              << " has_double_tail_e1_block_loop = " << has_double_tail_e1_block_loop
-              << std::endl;
+    std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;

     const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor =
-        make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(I0, I0))),
-                                         make_tuple(Sequence<0, 1>{}),
-                                         make_tuple(Sequence<0>{}));
+        GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);

     using CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo =
         decltype(c_blockid_to_k_n_ho_wo_block_cluster_adaptor);
@@ -337,6 +333,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
     c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.ToDevice(
         &c_blockid_to_k_n_ho_wo_block_cluster_adaptor);

+    if(has_main_e0_block_loop)
+    {
         const auto kernel =
             kernel_gemm_dlops_v2<GridwiseGemm,
                                  FloatAB,
@@ -345,9 +344,38 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
                                  remove_reference_t<BGridDesc_E0_E1_N_Ho_Wo_E2>,
                                  remove_reference_t<CGridDesc_K_N_Ho_Wo>,
                                  remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo>,
-                                 has_main_e0_block_loop,
-                                 has_main_e1_block_loop,
-                                 has_double_tail_e1_block_loop>;
+                                 true>;
+
+        ave_time = launch_and_time_kernel(
+            kernel,
+            nrepeat,
+            dim3(grid_size),
+            dim3(BlockSize),
+            0,
+            p_a_grid,
+            p_b_grid,
+            p_c_grid,
+            cast_pointer_to_constant_address_space(
+                a_e0_e1_k_e2_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_e0_e1_n_ho_wo_e2_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_k_n_hop_wop_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
+    }
+    else
+    {
+        const auto kernel =
+            kernel_gemm_dlops_v2<GridwiseGemm,
+                                 FloatAB,
+                                 FloatC,
+                                 remove_reference_t<AGridDesc_E0_E1_K_E2>,
+                                 remove_reference_t<BGridDesc_E0_E1_N_Ho_Wo_E2>,
+                                 remove_reference_t<CGridDesc_K_N_Ho_Wo>,
+                                 remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo>,
+                                 false>;

         ave_time = launch_and_time_kernel(
             kernel,
@@ -366,6 +394,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
                 c_k_n_hop_wop_grid_desc_dev_buf.GetDeviceBuffer()),
             cast_pointer_to_constant_address_space(
                 c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
+    }

 #endif
     return ave_time;
 }
......
@@ -20,7 +20,7 @@
 #include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"

-#define USE_DYNAMIC_MODE 0
+#define USE_DYNAMIC_MODE 1
 #define USE_CONV_FWD_V4R4_NCHW 0
 #define USE_CONV_FWD_V4R4R2_NHWC 0
 #define USE_CONV_FWD_V6R1_NCHW 0
@@ -52,6 +52,8 @@ int main(int argc, char* argv[])
     constexpr auto I5 = Number<5>{};
     constexpr auto I6 = Number<6>{};

+    constexpr index_t activ_type = 0;
+
 #if USE_DYNAMIC_MODE
     // dynamic mode
     if(argc != 22)
@@ -105,8 +107,6 @@ int main(int argc, char* argv[])
     const bool do_log = std::stoi(argv[5]);
     const int nrepeat = std::stoi(argv[6]);

-    constexpr index_t activ_type = 0;
-
 #if 1
     constexpr auto N = Number<1>{};
     constexpr auto C = Number<16>{};
......
@@ -29,7 +29,7 @@ INIT=$4
 LOG=$5
 REPEAT=$6

-./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT
+#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT

 ################################################ layout algo verify init log repeat N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
 #./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
@@ -53,7 +53,7 @@ REPEAT=$6
 #./host/driver_online/conv_fwd_driver_online $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
-#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1 16 16 3 3 1080 1920 1 1 1 1 1 1 1 1
+./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1 16 16 3 3 1080 1920 1 1 1 1 1 1 1 1
 #./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1 16 16 1 1 16 16 1 1 1 1 0 0 0 0

 ################################################ layout algo verify init log repeat M___ N___ K___
......