Commit b6950a3c authored by Chao Liu

debugging

parent 7b5b1c83
@@ -10,7 +10,7 @@
 #include "threadwise_tensor_slice_transfer.hpp"
 #include "threadwise_tensor_slice_set.hpp"
-#define DEBUG_USE_C_SHUFFLE 0
+#define DEBUG_USE_C_SHUFFLE 1
 namespace ck {
@@ -290,12 +290,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
         const auto M = c_grid_desc_m_n.GetLength(I0);
         const auto N = c_grid_desc_m_n.GetLength(I1);
-        const auto MBlock = M / MPerBlock;
-        const auto NBlock = N / NPerBlock;
         constexpr index_t MWave = MPerBlock / (MRepeat * MPerXdl);
         constexpr index_t NWave = NPerBlock / (NRepeat * NPerXdl);
+        const index_t MBlock = M / (MWave * MPerXdl * MRepeat);
+        const index_t NBlock = N / (NWave * NPerXdl * NRepeat);
         const auto c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl =
             transform_tensor_descriptor(
                 c_grid_desc_m_n,
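Note on this hunk: since MWave is defined as MPerBlock / (MRepeat * MPerXdl), the new expression M / (MWave * MPerXdl * MRepeat) reduces to the old M / MPerBlock; the change only reorders the declarations so MBlock/NBlock are written in terms of the wave tiling. A minimal standalone sketch, using made-up tile sizes that are not from this commit:

```cpp
// Standalone check (hypothetical example values) that the rewritten
// MBlock expression agrees with the old one.
int main()
{
    constexpr int MPerBlock = 128, MRepeat = 2, MPerXdl = 32; // example tile sizes
    constexpr int MWave     = MPerBlock / (MRepeat * MPerXdl); // = 2, as in the kernel
    constexpr int M         = 1024;                            // example problem size

    // MWave * MPerXdl * MRepeat == MPerBlock, so both block counts match.
    static_assert(M / (MWave * MPerXdl * MRepeat) == M / MPerBlock, "block counts differ");
    return 0;
}
```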
@@ -439,7 +439,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
         // A matrix blockwise copy
         auto a_blockwise_copy =
             BlockwiseTensorSliceTransfer_v4<BlockSize,
+#if 0
                                             AElementwiseOperation,
+#else
+                                            ck::tensor_operation::element_wise::PassThrough,
+#endif
                                             InMemoryDataOperationEnum_t::Set,
                                             Sequence<K0PerBlock, MPerBlock, K1>,
                                             ABlockTransferThreadSliceLengths_K0_M_K1,
@@ -467,7 +471,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
         // B matrix blockwise copy
         auto b_blockwise_copy =
             BlockwiseTensorSliceTransfer_v4<BlockSize,
+#if 0
                                             BElementwiseOperation,
+#else
+                                            ck::tensor_operation::element_wise::PassThrough,
+#endif
                                             InMemoryDataOperationEnum_t::Set,
                                             Sequence<K0PerBlock, NPerBlock, K1>,
                                             BBlockTransferThreadSliceLengths_K0_N_K1,
@@ -654,7 +662,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             FloatC,
             decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
             decltype(c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+#if 0
             CElementwiseOperation,
+#else
+            ck::tensor_operation::element_wise::PassThrough,
+#endif
             Sequence<M0, N0, I1, I1, M2, I1, M4, I1>,
             CThreadTransferSrcDstAccessOrder,
             CThreadTransferSrcDstVectorDim,
@@ -739,7 +751,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
             decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
             ck::tensor_operation::element_wise::PassThrough,
-            Sequence<I1, I1, I1, I1, M2, M3, M4, N2>,
+            Sequence<I1, I1, I1, I1, M2, I1, M4, I1>,
             Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
             7,
             1,
@@ -758,8 +770,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             ck::tensor_operation::element_wise::PassThrough{}};
         // TODO: this is hardcoded, only works for BlockSize = 256. fix it!
-        constexpr index_t MThread_CCopy = 16;
-        constexpr index_t NThread_CCopy = 16;
+        constexpr index_t MThread_CCopy = 32;
+        constexpr index_t NThread_CCopy = 8;
         constexpr index_t MPerThread_CCopy = MPerBlock_CCopy / MThread_CCopy;
         constexpr index_t NPerThread_CCopy = NPerBlock_CCopy / NThread_CCopy;
@@ -768,6 +780,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             make_naive_tensor_descriptor_packed(make_tuple(
                 I1, I1, Number<MPerBlock_CCopy>{}, I1, I1, Number<NPerBlock_CCopy>{}));
+        static_assert(c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl
+                              .GetElementSpaceSize() == 64 * 64,
+                      "wrong!");
         auto c_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
             static_cast<FloatAcc*>(p_shared),
             c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl
@@ -789,8 +805,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             Sequence<0, 1, 2, 3, 4, 5>, // typename DstDimAccessOrder,
             5,                          // index_t SrcVectorDim,
             5,                          // index_t DstVectorDim,
-            MThread_CCopy,              // index_t SrcScalarPerVector,
-            NThread_CCopy,              // index_t DstScalarPerVector,
+            NPerThread_CCopy,           // index_t SrcScalarPerVector,
+            NPerThread_CCopy,           // index_t DstScalarPerVector,
             1,                          // index_t SrcScalarStrideInVector,
             1,                          // index_t DstScalarStrideInVector,
             true,                       // bool ThreadTransferSrcResetCoordinateAfterRun,
@@ -827,6 +843,30 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
             c_block_buf);
+#if 0
+        if(get_thread_local_1d_id() == 0)
+        {
+            for(int mwave = 0; mwave < MWave; ++mwave)
+            {
+                for(int mperxdl = 0; mperxdl < MPerXdl; ++mperxdl)
+                {
+                    for(int nwave = 0; nwave < NWave; ++nwave)
+                    {
+                        for(int nperxdl = 0; nperxdl < NPerXdl; ++nperxdl)
+                        {
+                            int m      = mwave * MPerXdl + mperxdl;
+                            int n      = nwave * NPerXdl + nperxdl;
+                            int offset = m * NWave * NPerXdl + n;
+                            c_block_buf(offset) = 10 * mwave + nwave;
+                        }
+                    }
+                }
+            }
+        }
+#endif
         // make sure ds_write from c_thread_copy_vgpr_to_lds is completed
         block_sync_lds();
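Note on the disabled block in this hunk: run by a single thread, it would overwrite LDS with the value 10 * mwave + nwave per (mwave, nwave) tile, so a dump of C shows the wave layout at a glance. A host-side sketch of the same fill, with made-up sizes:

```cpp
// Host-side sketch (hypothetical sizes, not from this commit) of the pattern
// the disabled debug block writes into LDS.
#include <cstdio>

int main()
{
    constexpr int MWave = 2, NWave = 2, MPerXdl = 4, NPerXdl = 4; // example sizes
    float lds[MWave * MPerXdl * NWave * NPerXdl] = {};

    for(int mwave = 0; mwave < MWave; ++mwave)
        for(int mperxdl = 0; mperxdl < MPerXdl; ++mperxdl)
            for(int nwave = 0; nwave < NWave; ++nwave)
                for(int nperxdl = 0; nperxdl < NPerXdl; ++nperxdl)
                {
                    const int m      = mwave * MPerXdl + mperxdl;
                    const int n      = nwave * NPerXdl + nperxdl;
                    const int offset = m * NWave * NPerXdl + n; // row-major in (m, n)
                    lds[offset]      = 10 * mwave + nwave;      // tile-identifying value
                }

    // Print the (m, n) plane; each wave tile shows up as a constant block.
    for(int m = 0; m < MWave * MPerXdl; ++m)
    {
        for(int n = 0; n < NWave * NPerXdl; ++n)
            std::printf("%2.0f ", lds[m * NWave * NPerXdl + n]);
        std::printf("\n");
    }
    return 0;
}
```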
@@ -837,6 +877,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
             c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
             c_grid_buf);
+        // make sure ds_read from c_block_copy_lds_to_global is completed
+        block_sync_lds();
         // move on nrepeat dimension
         if constexpr(nrepeat_forward_sweep && (nrepeat < NRepeat - 1))
         {
...
@@ -166,7 +166,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
         index_t tmp = ordered_src_access_idx[I0];
         // TODO: BUG: should start at 1
-        static_for<0, i, 1>{}([&](auto j) {
+        static_for<1, i, 1>{}([&](auto j) {
             tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
         });
@@ -613,7 +613,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2
         index_t tmp = ordered_dst_access_lengths[I0] - 1;
         // TODO: BUG: should start at 1
-        static_for<0, i, 1>{}([&](auto j) {
+        static_for<1, i, 1>{}([&](auto j) {
             tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
         });
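Note on both hunks above: the loop flattens a multi-index into a row-major scalar offset by a Horner-style fold. Since tmp is already seeded with the component at I0, a fold starting at j = 0 would mix that leading component in twice; starting at j = 1 is the fix the TODO asks for. A plain-loop sketch of the corrected fold:

```cpp
// Plain-loop sketch of the fold performed by the static_for above. With the
// loop starting at j = 0, the first step would compute idx[0] * len[0] + idx[0],
// folding the leading index in twice; starting at j = 1 is correct.
#include <cassert>

int flatten(const int* idx, const int* len, int rank)
{
    int tmp = idx[0];
    for(int j = 1; j < rank; ++j) // must start at 1, not 0
        tmp = tmp * len[j] + idx[j];
    return tmp;
}

int main()
{
    const int len[3] = {2, 3, 4};
    const int idx[3] = {1, 2, 3};
    assert(flatten(idx, len, 3) == (1 * 3 + 2) * 4 + 3); // row-major offset 23
    return 0;
}
```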
...
@@ -198,6 +198,10 @@ int main(int argc, char* argv[])
     {
     case 0: break;
     case 1:
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
+        break;
+    case 2:
         in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
         wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
         break;
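Note on the new case 1: filling both tensors with ones (GeneratorTensor_1) makes the expected convolution output trivially predictable, which fits the "debugging" intent of this commit. A scalar sketch of that expectation, with example reduction sizes:

```cpp
// With all-one inputs and weights, every direct-convolution output element
// equals the reduction size C * Y * X (hypothetical sizes below), so any
// mismatch points at an indexing bug rather than value-dependent accumulation.
#include <cassert>

int main()
{
    constexpr int C = 8, Y = 3, X = 3; // example reduction dimensions
    int out = 0;
    for(int c = 0; c < C; ++c)
        for(int y = 0; y < Y; ++y)
            for(int x = 0; x < X; ++x)
                out += 1 * 1; // in == 1, wei == 1
    assert(out == C * Y * X); // every output element is 72 here
    return 0;
}
```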
@@ -272,5 +276,15 @@ int main(int argc, char* argv[])
     out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
     check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
+    {
+        LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",") << std::endl;
+        LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",") << std::endl;
+        LogRangeAsType<float>(std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
+            << std::endl;
+        LogRangeAsType<float>(
+            std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
+            << std::endl;
+    }
 }
 }