Commit d891a596 authored by root

Turn off some of the trace output

parent dc3519ae
@@ -21,6 +21,82 @@ using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// clang-format off
using DeviceGemmV2_Streamk_Instance =
ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
PassThrough, PassThrough, PassThrough, GemmDefault,
64, // Block Size
16, // MPer Block
16, // NPer Block
64, // KPer Block
8, // AK1
8, // BK1
16, // MPer XDL
16, // NPer XDL
1, // Mxdl Per Wave
1, // Nxdl Per Wave
S<8, 8, 1>,     // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
8, // ABlockTransfer SrcScalar PerVector
8, // ABlockTransfer DstScalar PerVector_K1
0, // ABlockLds AddExtraM
S<8, 8, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>,     // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
8, // BBlockTransfer SrcScalar PerVector
8, // BBlockTransfer DstScalar PerVector_K1
0,              // BBlockLds AddExtraN
1, // CShuffle MXdlPerWave PerShuffle
1, // CShuffle NXdlPerWave PerShuffle
S<1, 16, 1, 4>, // CBlockTransferClusterLengths _MBlock_MXdlPerWave_MWaveMPerXdl _NBlock_NXdlPerWave_NWaveNPerXdl
4, // CBlockTransfer ScalarPerVector _NWaveNPerXdl
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
// clang-format on
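For reference, an instance like DeviceGemmV2_Streamk_Instance is normally driven through the usual CK device-op flow (MakeArgument, IsSupportedArgument, MakeInvoker, Run). The sketch below only outlines that flow; the buffer/size names and the exact MakeArgument parameter list of the Stream-K V3 op (in particular any Stream-K grid-size argument) are assumptions for illustration, not taken from this commit.

// Usage sketch (assumed names; the Stream-K V3 MakeArgument signature may differ,
// e.g. it may take an explicit Stream-K grid-size or stream-k selection parameter).
auto gemm    = DeviceGemmV2_Streamk_Instance{};
auto invoker = gemm.MakeInvoker();

// p_a, p_b, p_c: device buffers for A (MxK), B (KxN), C (MxN), assumed to be
// allocated and initialized elsewhere in the example.
auto argument = gemm.MakeArgument(p_a, p_b, p_c,
                                  M, N, K,
                                  StrideA, StrideB, StrideC,
                                  /*Grid_size=*/-1, // assumed Stream-K parameter
                                  PassThrough{}, PassThrough{}, PassThrough{});

if(!gemm.IsSupportedArgument(argument))
{
    throw std::runtime_error("this device op instance does not support the given problem");
}

// StreamConfig{nullptr, true} requests kernel timing on the default stream.
float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});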
// // clang-format off
// using DeviceGemmV2_Streamk_Instance =
// ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
// ALayout, BLayout, CLayout,
// ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
// PassThrough, PassThrough, PassThrough, GemmDefault,
// 64, // Block Size
// 16, // MPer Block
// 16, // NPer Block
// 128, // KPer Block
// 8, // AK1
// 8, // BK1
// 16, // MPer XDL
// 16, // NPer XDL
// 1, // Mxdl Per Wave
// 1, // Nxdl Per Wave
// S<16, 4, 1>,     // ABlockTransfer ThreadCluster Lengths_K0_M_K1
// S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
// S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
// 2, // ABlockTransfer SrcVectorDim
// 8, // ABlockTransfer SrcScalar PerVector
// 8, // ABlockTransfer DstScalar PerVector_K1
// 0, // ABlockLds AddExtraM
// S<16, 4, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
// S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
// S<1, 0, 2>,     // BBlockTransfer SrcAccessOrder
// 2, // BBlockTransfer SrcVectorDim
// 8, // BBlockTransfer SrcScalar PerVector
// 8, // BBlockTransfer DstScalar PerVector_K1
// 0,              // BBlockLds AddExtraN
// 1, // CShuffle MXdlPerWave PerShuffle
// 1, // CShuffle NXdlPerWave PerShuffle
// S<1, 16, 1, 4>, // CBlockTransferClusterLengths _MBlock_MXdlPerWave_MWaveMPerXdl _NBlock_NXdlPerWave_NWaveNPerXdl
// 4, // CBlockTransfer ScalarPerVector _NWaveNPerXdl
// ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
// // clang-format on
#if 0
// clang-format off
using DeviceGemmV2_Streamk_Instance =
ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
@@ -40,6 +116,8 @@ using DeviceGemmV2_Streamk_Instance =
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
// clang-format on
#endif
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
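ReferenceGemmInstance is the host-side check the CK examples run against the device result. A minimal verification sketch, assuming host Tensor<> buffers named a_m_k, b_k_n, c_m_n_host_result and c_m_n_device_result exist (the names follow the usual CK example convention and are assumptions, not part of this commit):

// Host reference GEMM and comparison (assumed tensor names).
auto ref_gemm     = ReferenceGemmInstance{};
auto ref_invoker  = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
    a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});

ref_invoker.Run(ref_argument);

// check_err compares the device output against the host reference element-wise.
bool pass = ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);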
...
@@ -1149,13 +1149,15 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
bool is_sk_block, is_dp_block;
index_t num_k_block_main_loop;
#if 0
// Emin @debug
// Debug: Print initial problem size and grid configuration
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk Line:1157 Problem M: %d, N: %d, K: %d, Grid Size: %d\n", problem.M, problem.N, problem.K, problem.Grid_size);
}
#endif
for(auto block_idx = get_block_1d_id();
@@ -1174,13 +1176,14 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end);
num_k_block_main_loop = iter_end - iter_start;
#if 1
// Emin @debug
// Debug: Print block information
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Block Index: %d, Iteration Start: %d, Iteration End: %d, Is Stream-K: %d, Is Data-Parallel: %d\n",
block_idx, iter_start, iter_end, is_sk_block, is_dp_block);
}
#endif
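Each trace touched by this commit is toggled by its own #if 0 / #if 1 pair. A possible consolidation, sketched below with a hypothetical CK_SK_TRACE switch (an assumption, not an existing CK macro), would let every stream-k printf in the kernel be turned off from one place, which is effectively what this commit does by hand:

// Hypothetical file-local trace switch (assumption, not part of CK).
#ifndef CK_SK_TRACE
#define CK_SK_TRACE 0 // set to 1 to re-enable all of the traces below
#endif

#if CK_SK_TRACE
if(threadIdx.x == 0 && threadIdx.y == 0)
{
    printf("Block Index: %d, Iteration Start: %d, Iteration End: %d\n",
           static_cast<int>(block_idx),
           static_cast<int>(iter_start),
           static_cast<int>(iter_end));
}
#endif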
__syncthreads();
while(true)
@@ -1216,6 +1219,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
#if 0
// Emin @debug
// Debug: Print grid descriptor sizes
if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -1225,6 +1230,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
}
#endif
// Emin @added
__syncthreads();
@@ -1259,6 +1266,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
#if 0
// Emin @debug
// Debug: Print block data indices on grid
if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -1266,7 +1274,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
m_block_data_idx_on_grid, n_block_data_idx_on_grid, k0_block_data_idx_on_grid);
}
#endif
// Emin @added
__syncthreads();
@@ -1363,15 +1371,16 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
constexpr auto b_block_slice_copy_step =
make_multi_index(KPerBlock / BK1Number, 0, 0);
#if 0
// Emin @debug
// Debug: Print shared memory buffer sizes for A and B
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Shared Memory Buffer Size - A: %d, B: %d\n",
a_block_desc_ak0_m_ak1.GetElementSpaceSize(),
b_block_desc_bk0_n_bk1.GetElementSpaceSize());
}
#endif
// Blockwise GEMM pipeline
static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
@@ -1528,6 +1537,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
n_thread_data_on_block_idx[I2]),
ck::tensor_operation::element_wise::PassThrough{}};
// Emin @Note : traced this call path: ThreadGroupTensorSliceTransfer_v6r1r2 --> ThreadwiseTensorSliceTransfer_v6r1r2
// shuffle: blockwise copy C from LDS to global
auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1r2<
ThisThreadBlock, // ThreadGroup
@@ -1588,16 +1600,34 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
// Emin @Note : this always prints 0
#if 0
// Emin @debug
// Debug: Print before writing C to LDS
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk line 1606 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast<int>(access_id));
}
#endif
// Emin @added
__syncthreads();
// Emin @debug
// Debug: Print sfc_c_vgpr index tuple
#if 1
if (threadIdx.x == 0 && threadIdx.y == 0) {
auto index_tuple = sfc_c_vgpr.GetIndexTupleOfNumber(access_id);
printf("Gridwise_gemm_sk Debug line 1618--Block %d, Access %d: sfc_c_vgpr Index Tuple = (%d, %d, %d, ...).\n",
block_idx, static_cast<int>(access_id),
static_cast<int>(index_tuple.At(Number<0>{})),
static_cast<int>(index_tuple.At(Number<1>{})),
static_cast<int>(index_tuple.At(Number<2>{}))); // Adjust based on tuple size
}
#endif
// each thread write its data from VGPR to LDS
c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
@@ -1628,9 +1658,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
#if 1
// Emin @debug
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk line 1662 --is_sk_block !! each block copy data from LDS to global #Start.\n");
}
#endif
// Emin @added
__syncthreads();
@@ -1645,6 +1679,14 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
c_shuffle_block_buf,
c_grid_desc_mblock_mperblock_nblock_nperblock,
c_grid_buf);
#if 1
// Emin @debug
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk line 1684 --is_sk_block !! each block copy data from LDS to global #End.\n");
}
#endif
}
if constexpr(access_id < num_access - 1)
...
@@ -103,13 +103,13 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
auto dst_vector_container = dst_vector_type{};
// Emin @debug
// Debug: Print source vector data if valid
if (threadIdx.x == 0 && threadIdx.y == 0 && is_src_valid) {
// printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f\n", static_cast<int>(idx_1d.value), static_cast<float>());
printf("BlockId %d - Threadwise_tensor slice v6r1r2 line 109: Src Vector Data at idx %d: %f \n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
// printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n", static_cast<int>(idx_1d.value), src_vector_container.template AsType<SrcData>().At(Number<0>{}));
}
// Emin @debug
// apply pointwise operation
static_for<0, ScalarPerVector, 1>{}([&](auto i) {
@@ -120,14 +120,15 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
// Emin @debug
// Debug: Print element-wise operation result
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(v));
}
// Emin @added
__syncthreads();
// Emin @debug
#if 1
// Debug: Print SrcData before and after applying element-wise operation
if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<i>{})));
@@ -139,7 +140,7 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
// printf("SrcData after element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(v));
printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %f \n" , static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value) , static_cast<int>(i.value), static_cast<float>(v));
}
#endif
// Emin @added
__syncthreads();
@@ -171,14 +172,18 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>()[I0]);
#if 0
// Emin @debug
// // Debug: Print data before copying from dst_vector into dst_buf
if (threadIdx.x == 0 && threadIdx.y == 0 && is_dst_valid) {
// printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<DstData>().At(I0));
// printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
}
#endif
// move coordinate
if constexpr(idx_1d.value != num_access - 1)
...
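The @Note in the gridwise kernel above traces the LDS-to-global shuffle copy from ThreadGroupTensorSliceTransfer_v6r1r2 down into ThreadwiseTensorSliceTransfer_v6r1r2::Run, which is where the per-element traces in this last hunk fire. As a self-contained illustration of that delegation pattern only (the simplified names and flat partitioning are assumptions, not CK's actual implementation):

#include <cstdio>
#include <vector>

// Per-thread copy: moves one contiguous slice and applies an element-wise op,
// standing in for the role ThreadwiseTensorSliceTransfer_v6r1r2 plays per thread.
template <typename ElementOp>
struct ThreadwiseCopySketch
{
    void Run(const float* src, float* dst, int slice_size, ElementOp op) const
    {
        for(int i = 0; i < slice_size; ++i)
            dst[i] = op(src[i]);
    }
};

// Group copy: partitions the tile across "threads" and delegates each slice,
// standing in for how the thread-group transfer forwards to the threadwise one.
template <typename ElementOp>
struct ThreadGroupCopySketch
{
    void Run(const float* src, float* dst, int tile_size, int num_threads, ElementOp op) const
    {
        const int slice_size = tile_size / num_threads; // assume it divides evenly
        ThreadwiseCopySketch<ElementOp> threadwise;
        for(int tid = 0; tid < num_threads; ++tid) // serial stand-in for the parallel threads
            threadwise.Run(src + tid * slice_size, dst + tid * slice_size, slice_size, op);
    }
};

int main()
{
    auto pass_through = [](float x) { return x; }; // PassThrough-style element op
    std::vector<float> src(64, 1.0f), dst(64, 0.0f);
    ThreadGroupCopySketch<decltype(pass_through)>{}.Run(src.data(), dst.data(), 64, 4, pass_through);
    std::printf("dst[0] = %f\n", dst[0]);
    return 0;
}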