"...composable_kernel.git" did not exist on "abf4bdb9a9946c578d4801a79650e79938fb0e41"
Commit e131f6aa authored by Chao Liu

refactor

parent f64fab12
@@ -164,7 +164,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         constexpr index_t KBlockWork = K / KPerBlock;
         constexpr index_t BBlockWork = B / BPerBlock;
 
-#if 0
         constexpr auto block_work_desc =
             make_cluster_descriptor(Sequence<KBlockWork, BBlockWork>{});
@@ -172,15 +171,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         const index_t k_block_data_on_global = block_work_id[0] * KPerBlock;
         const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;
-#else
-        constexpr auto block_work_desc =
-            make_cluster_descriptor(Sequence<BBlockWork, KBlockWork>{});
-
-        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
-
-        const index_t b_block_data_on_global = block_work_id[0] * BPerBlock;
-        const index_t k_block_data_on_global = block_work_id[1] * KPerBlock;
-#endif
 
         // input tensor
         //     global tensor in global memory
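The hunk above keeps a single block-work decomposition and drops the disabled #if 0 / #else / #endif alternative. As a reading aid, the following standalone sketch (illustration only; it is not the composable_kernel implementation, and the sizes are made up) shows the divide/modulo arithmetic a row-major two-dimensional cluster descriptor would perform in CalculateClusterIndex, and how swapping the dimension order in the Sequence changes which coordinate varies fastest across consecutive block ids:

#include <array>
#include <cassert>
#include <cstdio>

// Decompose a linear block id into 2-D cluster coordinates, row-major:
// the second dimension varies fastest across consecutive ids.
std::array<int, 2> calculate_cluster_index(int linear_id, int len0, int len1)
{
    assert(linear_id < len0 * len1);
    return {linear_id / len1, linear_id % len1};
}

int main()
{
    constexpr int KBlockWork = 4; // hypothetical K / KPerBlock
    constexpr int BBlockWork = 8; // hypothetical B / BPerBlock

    // Sequence<KBlockWork, BBlockWork>: consecutive block ids walk along B first.
    auto kb = calculate_cluster_index(10, KBlockWork, BBlockWork);
    std::printf("k_block_work = %d, b_block_work = %d\n", kb[0], kb[1]); // 1, 2

    // Sequence<BBlockWork, KBlockWork>: consecutive block ids walk along K first.
    auto bk = calculate_cluster_index(10, BBlockWork, KBlockWork);
    std::printf("b_block_work = %d, k_block_work = %d\n", bk[0], bk[1]); // 2, 2
    return 0;
}

Under that row-major assumption, the retained Sequence<KBlockWork, BBlockWork> order makes neighbouring block ids map to neighbouring b_block_data_on_global values, while the deleted order would have walked along K instead.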
@@ -56,11 +56,6 @@ struct BlockwiseGenericTensorSliceCopy_v4
         constexpr auto thread_cluster_desc =
             make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
 
-#if 0
-        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
-                      "wrong! BlockSize not consistent with ThreadClusterLengths");
-#endif
-
         const auto thread_cluster_id =
             thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
@@ -88,7 +83,19 @@ struct BlockwiseGenericTensorSliceCopy_v4
         constexpr auto thread_cluster_desc =
             make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
 
-        if(get_thread_local_1d_id() < thread_cluster_desc.GetElementSize())
+        if(BlockSize == thread_cluster_desc.GetElementSize())
+        {
+            // TODO: threadwise copy is still being tweaked
+            if(has_optimized_address_calculation)
+            {
+                mThreadwiseLoad.Run_optimized_src_address_calculation(p_block_src, p_thread_buffer);
+            }
+            else
+            {
+                mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
+            }
+        }
+        else if(get_thread_local_1d_id() < thread_cluster_desc.GetElementSize())
         {
             // TODO: threadwise copy is still being tweaked
             if(has_optimized_address_calculation)
@@ -112,7 +119,20 @@ struct BlockwiseGenericTensorSliceCopy_v4
         constexpr auto thread_cluster_desc =
             make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
 
-        if(get_thread_local_1d_id() < thread_cluster_desc.GetElementSize())
+        if(BlockSize == thread_cluster_desc.GetElementSize())
+        {
+            // TODO: threadwise copy is still being tweaked
+            if(has_optimized_address_calculation)
+            {
+                mThreadwiseStore.Run_optimized_dst_address_calculation(p_thread_buffer,
+                                                                       p_block_dst);
+            }
+            else
+            {
+                mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
+            }
+        }
+        else if(get_thread_local_1d_id() < thread_cluster_desc.GetElementSize())
         {
             // TODO: threadwise copy is still being tweaked
             if(has_optimized_address_calculation)
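The two copy hunks above add a fast path for the case where BlockSize equals the thread-cluster size and keep the thread-id bound check only as the else branch. A minimal host-side stand-in of that control flow (simplified sketch, not the device code; copy_slice is a hypothetical placeholder for the threadwise load/store calls):

#include <cstdio>

// Hypothetical placeholder for mThreadwiseLoad.Run / mThreadwiseStore.Run.
void copy_slice(int thread_id) { std::printf("thread %d copies its slice\n", thread_id); }

// Control flow mirroring the guard added above (host-side stand-in only).
void run_copy(int thread_id, int block_size, int cluster_size)
{
    if(block_size == cluster_size)
    {
        copy_slice(thread_id); // every thread of the block participates
    }
    else if(thread_id < cluster_size)
    {
        copy_slice(thread_id); // cluster smaller than block: trailing threads stay idle
    }
}

int main()
{
    for(int tid = 0; tid < 4; ++tid)
    {
        run_copy(tid, /*block_size=*/4, /*cluster_size=*/2); // only threads 0 and 1 copy
    }
    return 0;
}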
@@ -111,7 +111,6 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
         constexpr index_t MBlockWork = M / MPerBlock;
         constexpr index_t NBlockWork = N / NPerBlock;
 
-#if 1
         constexpr auto block_work_desc =
             make_cluster_descriptor(Sequence<MBlockWork, NBlockWork>{});
@@ -119,15 +118,6 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
         const index_t m_block_data_on_global = block_work_id[0] * MPerBlock;
         const index_t n_block_data_on_global = block_work_id[1] * NPerBlock;
-#else
-        constexpr auto block_work_desc =
-            make_cluster_descriptor(Sequence<NBlockWork, MBlockWork>{});
-
-        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
-
-        const index_t n_block_data_on_global = block_work_id[0] * NPerBlock;
-        const index_t m_block_data_on_global = block_work_id[1] * MPerBlock;
-#endif
 
         // A matrix in LDS memory, dst of blockwise copy
         //   be careful of LDS alignment