Commit d891a596 authored by root

Turn off some trace output

parent dc3519ae
@@ -21,6 +21,82 @@ using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// clang-format off
using DeviceGemmV2_Streamk_Instance =
ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
PassThrough, PassThrough, PassThrough, GemmDefault,
64, // Block Size
16, // MPer Block
16, // NPer Block
64, // KPer Block
8, // AK1
8, // BK1
16, // MPer XDL
16, // NPer XDL
1, // Mxdl Per Wave
1, // Nxdl Per Wave
S<8, 8, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
8, // ABlockTransfer SrcScalar PerVector
8, // ABlockTransfer DstScalar PerVector_K1
0, // ABlockLds AddExtraM
S<8, 8, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
8, // BBlockTransfer SrcScalar PerVector
8, // BBlockTransfer DstScalar PerVector_K1
0, // BBlockLds AddExtraN
1, // CShuffle MXdlPerWave PerShuffle
1, // CShuffle NXdlPerWave PerShuffle
S<1, 16, 1, 4>, // CBlockTransferClusterLengths _MBlock_MXdlPerWave_MWaveMPerXdl _NBlock_NXdlPerWave_NWaveNPerXdl
4, // CBlockTransfer ScalarPerVector _NWaveNPerXdl
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
// clang-format on
// // clang-format off
// using DeviceGemmV2_Streamk_Instance =
// ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
// ALayout, BLayout, CLayout,
// ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
// PassThrough, PassThrough, PassThrough, GemmDefault,
// 64, // Block Size
// 16, // MPer Block
// 16, // NPer Block
// 128, // KPer Block
// 8, // AK1
// 8, // BK1
// 16, // MPer XDL
// 16, // NPer XDL
// 1, // Mxdl Per Wave
// 1, // Nxdl Per Wave
// S<16, 4, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
// S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
// S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
// 2, // ABlockTransfer SrcVectorDim
// 8, // ABlockTransfer SrcScalar PerVector
// 8, // ABlockTransfer DstScalar PerVector_K1
// 0, // ABlockLds AddExtraM
// S<16, 4, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
// S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
// S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
// 2, // BBlockTransfer SrcVectorDim
// 8, // BBlockTransfer SrcScalar PerVector
// 8, // BBlockTransfer DstScalar PerVector_K1
// 0, // BBlockLds AddExtraN
// 1, // CShuffle MXdlPerWave PerShuffle
// 1, // CShuffle NXdlPerWave PerShuffle
// S<1, 16, 1, 4>, // CBlockTransferClusterLengths _MBlock_MXdlPerWave_MWaveMPerXdl _NBlock_NXdlPerWave_NWaveNPerXdl
// 4, // CBlockTransfer ScalarPerVector _NWaveNPerXdl
// ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
// // clang-format on
#if 0
// clang-format off
using DeviceGemmV2_Streamk_Instance =
ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
@@ -40,6 +116,8 @@ using DeviceGemmV2_Streamk_Instance =
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
// clang-format on
#endif
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
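For context, a minimal host-side sketch of how a CK device-op instance such as DeviceGemmV2_Streamk_Instance is typically driven, assuming the usual composable_kernel example pattern (MakeArgument / MakeInvoker / IsSupportedArgument / Run). The names a_dev, b_dev, c_dev and the size/stride variables are placeholders, and the exact MakeArgument overload of the Stream-K V3 op may take additional stream-k parameters, so treat this as an illustration rather than the code used by this commit:

// Hedged sketch: follows the common CK example driver pattern; variable names are placeholders.
auto gemm    = DeviceGemmV2_Streamk_Instance{};
auto invoker = gemm.MakeInvoker();

// a_dev/b_dev/c_dev are assumed device allocations of ADataType/BDataType/CDataType.
auto argument = gemm.MakeArgument(a_dev, b_dev, c_dev,
                                  M, N, K,
                                  StrideA, StrideB, StrideC,
                                  AElementOp{}, BElementOp{}, CElementOp{});

if(gemm.IsSupportedArgument(argument))
{
    // StreamConfig{nullptr, true} runs on the default stream and reports kernel time.
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
}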
@@ -1149,13 +1149,15 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
bool is_sk_block, is_dp_block;
index_t num_k_block_main_loop;
#if 0
// Emin @debug
// Debug: Print initial problem size and grid configuration
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Gridwise_gemm_sk Line:1157 Problem M: %d, N: %d, K: %d, Grid Size: %d\n", problem.M, problem.N, problem.K, problem.Grid_size);
// }
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk Line:1157 Problem M: %d, N: %d, K: %d, Grid Size: %d\n", problem.M, problem.N, problem.K, problem.Grid_size);
}
#endif
for(auto block_idx = get_block_1d_id();
@@ -1174,13 +1176,14 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end);
num_k_block_main_loop = iter_end - iter_start;
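// Note (hedged, general Stream-K background): get_block_itr() assigns this workgroup a
// contiguous range of K-iterations [iter_start, iter_end) that may span output-tile
// boundaries; "sk" blocks contribute partial results that are combined later, while
// "dp" blocks own complete tiles, so num_k_block_main_loop is this block's main-loop count.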
#if 1
// Emin @debug
// Debug: Print block information
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Block Index: %d, Iteration Start: %d, Iteration End: %d, Is Stream-K: %d, Is Data-Parallel: %d\n",
block_idx, iter_start, iter_end, is_sk_block, is_dp_block);
}
#endif
__syncthreads();
while(true)
@@ -1216,6 +1219,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
#if 0
// Emin @debug
// Debug: Print grid descriptor sizes
if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -1225,6 +1230,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
}
#endif
// Emin @added
__syncthreads();
@@ -1259,6 +1266,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
#if 0
// Emin @debug
// Debug: Print block data indices on grid
if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -1266,7 +1274,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
m_block_data_idx_on_grid, n_block_data_idx_on_grid, k0_block_data_idx_on_grid);
}
#endif
// Emin @added
__syncthreads();
@@ -1363,15 +1371,16 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
constexpr auto b_block_slice_copy_step =
make_multi_index(KPerBlock / BK1Number, 0, 0);
#if 0
// Emin @debug
// Debug: Print shared memory buffer sizes for A and B
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Shared Memory Buffer Size - A: %d, B: %d\n",
// a_block_desc_ak0_m_ak1.GetElementSpaceSize(),
// b_block_desc_bk0_n_bk1.GetElementSpaceSize());
// }
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Shared Memory Buffer Size - A: %d, B: %d\n",
a_block_desc_ak0_m_ak1.GetElementSpaceSize(),
b_block_desc_bk0_n_bk1.GetElementSpaceSize());
}
#endif
// Blockwise GEMM pipeline
static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
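// Note (hedged, general CK structure): BlockwiseGemmPipe implements the main K-loop,
// streaming A/B tiles from global memory through LDS and accumulating partial products
// in VGPRs for num_k_block_main_loop iterations before the CShuffle epilogue below.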
@@ -1528,6 +1537,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
n_thread_data_on_block_idx[I2]),
ck::tensor_operation::element_wise::PassThrough{}};
// Emin @Note: traced this call path: ThreadGroupTensorSliceTransfer_v6r1r2 --> Threadwise_tensor_slice_transfer_v6r1r2
// shuffle: blockwise copy C from LDS to global
auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1r2<
ThisThreadBlock, // ThreadGroup
@@ -1588,16 +1600,34 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
// Emin @Note: this always prints 0
#if 0
// Emin @debug
// Debug: Print before writing C to LDS
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk line 1594 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast<int>(access_id));
printf("Gridwise_gemm_sk line 1606 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast<int>(access_id));
}
#endif
// Emin @added
__syncthreads();
// Emin @debug
// Debug: Print sfc_c_vgpr index tuple
#if 1
if (threadIdx.x == 0 && threadIdx.y == 0) {
auto index_tuple = sfc_c_vgpr.GetIndexTupleOfNumber(access_id);
printf("Gridwise_gemm_sk Debug line 1618--Block %d, Access %d: sfc_c_vgpr Index Tuple = (%d, %d, %d, ...).\n",
block_idx, static_cast<int>(access_id),
static_cast<int>(index_tuple.At(Number<0>{})),
static_cast<int>(index_tuple.At(Number<1>{})),
static_cast<int>(index_tuple.At(Number<2>{}))); // Adjust based on tuple size
}
#endif
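// Note (hedged): sfc_c_vgpr is a space-filling-curve descriptor over the accumulator
// tile held in VGPRs; access_id selects which slice of that curve is shuffled through
// LDS on this pass, which is the per-access index tuple printed above.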
// each thread write its data from VGPR to LDS
c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
@@ -1628,9 +1658,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
#if 1
// Emin @debug
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk line 1602 --is_sk_block !! each block copy data from LDS to global.\n");
printf("Gridwise_gemm_sk line 1662 --is_sk_block !! each block copy data from LDS to global #Start.\n");
}
#endif
// Emin @added
__syncthreads();
@@ -1645,6 +1679,14 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
c_shuffle_block_buf,
c_grid_desc_mblock_mperblock_nblock_nperblock,
c_grid_buf);
#if 1
// Emin @debug
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk line 1684 --is_sk_block !! each block copy data from LDS to global #End.\n");
}
#endif
}
if constexpr(access_id < num_access - 1)
@@ -103,13 +103,13 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
auto dst_vector_container = dst_vector_type{};
// Emin @debug
// Debug: Print source vector data if valid
// Debug: Print source vector data if valid
if (threadIdx.x == 0 && threadIdx.y == 0 && is_src_valid) {
// printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f\n", static_cast<int>(idx_1d.value), static_cast<float>());
printf("BlockId %d - Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
printf("BlockId %d - Threadwise_tensor slice v6r1r2 line 109: Src Vector Data at idx %d: %f \n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
// printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n", static_cast<int>(idx_1d.value), src_vector_container.template AsType<SrcData>().At(Number<0>{}));
}
// Emin@debug
// Emin @debug
// apply pointwise operation
static_for<0, ScalarPerVector, 1>{}([&](auto i) {
@@ -120,14 +120,15 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
// Emin @debug
// Debug: Print element-wise operation result
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(v));
// }
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(v));
}
// Emin @added
__syncthreads();
// Emin @debug
#if 1
// Debug: Print SrcData before and after applying element-wise operation
if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<i>{})));
@@ -139,7 +140,7 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
// printf("SrcData after element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(v));
printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %f \n" , static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value) , static_cast<int>(i.value), static_cast<float>(v));
}
#endif
// Emin @added
__syncthreads();
@@ -171,14 +172,18 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>()[I0]);
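// Note (hedged): the call above stores the assembled destination vector into dst_buf at
// the current coordinate offset; is_dst_valid masks lanes that fall outside the tensor.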
#if 0
// Emin @debug
// // Debug: Print data before copying from dst_vector into dst_buf
if (threadIdx.x == 0 && threadIdx.y == 0 && is_dst_valid) {
// printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<DstData>().At(I0));
// printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
// // // // Debug: Print data before copying from dst_vector into dst_buf
// if (threadIdx.x == 0 && threadIdx.y == 0 && is_dst_valid) {
// // printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<DstData>().At(I0));
// // printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
}
// printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
// }
#endif
// move coordinate
if constexpr(idx_1d.value != num_access - 1)