Commit 718cadca authored by root

debug trace deep thread level

parent 20e283dc
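
The change repeated throughout this commit is a barrier-bracketed, single-thread trace point. A minimal sketch of the pattern, assuming a HIP kernel with a one- or two-dimensional thread block (the helper name is illustrative, not from the commit):

    #include <hip/hip_runtime.h>
    #include <cstdio>

    // Sketch of the trace-point pattern inserted below. The leading barrier
    // quiesces the block before printing; the trailing barrier keeps later
    // work from racing past the checkpoint. Note __syncthreads() must be
    // reached by every thread of the block, so the pattern is only safe at
    // block-uniform points in the control flow.
    __device__ void trace_checkpoint(const char* tag)
    {
        __syncthreads();
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            printf("Block %d: %s\n", static_cast<int>(blockIdx.x), tag);
        }
        __syncthreads();
    }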
@@ -1181,6 +1181,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             block_idx, iter_start, iter_end, is_sk_block, is_dp_block);
     }
+    __syncthreads();
+
     while(true)
     {
         uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
@@ -1211,6 +1213,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
         MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
             c_grid_desc_m_n, problem.MBlock, problem.NBlock);
+    // Emin @added
+    __syncthreads();
+
     // Emin @debug
     // Debug: Print grid descriptor sizes
     if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -1220,6 +1225,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
     }
+    // Emin @added
+    __syncthreads();
+
     // Create dynamic buffers for A, B, C matrices in global memory
     auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
         p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
@@ -1248,6 +1256,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
     const index_t k0_block_data_idx_on_grid =
         __builtin_amdgcn_readfirstlane(iter_offset * AK0Number);
+    // Emin @added
+    __syncthreads();
+
     // Emin @debug
     // Debug: Print block data indices on grid
     if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -1255,6 +1266,10 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
         m_block_data_idx_on_grid, n_block_data_idx_on_grid, k0_block_data_idx_on_grid);
     }
+
+    // Emin @added
+    __syncthreads();
+
     // lds max alignment
     constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
@@ -1367,12 +1382,18 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
         (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
         KPerBlock);
+    // Emin @added
+    __syncthreads();
+
     // Emin @debug
     // Debug: Print number of K blocks in main loop
     if (threadIdx.x == 0 && threadIdx.y == 0) {
         printf("Number of K Blocks in Main Loop: %d\n", num_k_block_main_loop);
     }
+    // Emin @added
+    __syncthreads();
+
     blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
         a_grid_desc_ak0_m_ak1,
         a_block_desc_ak0_m_ak1,
@@ -1564,13 +1585,19 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
         // make sure it's safe to write to LDS
         block_sync_lds();
+        // Emin @added
+        __syncthreads();
+
         // Emin @debug
         // Debug: Print before writing C to LDS
         if (threadIdx.x == 0 && threadIdx.y == 0) {
-            printf("Gridwise_gemm_sk line 1570 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast<int>(access_id));
+            printf("Gridwise_gemm_sk line 1594 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast<int>(access_id));
         }
+        // Emin @added
+        __syncthreads();
+
         // each thread write its data from VGPR to LDS
         c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
                                       sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
@@ -1598,10 +1625,17 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
         }
         else if(is_sk_block)
         {
+            // Emin @added
+            __syncthreads();
+
             if (threadIdx.x == 0 && threadIdx.y == 0) {
                 printf("Gridwise_gemm_sk line 1602 --is_sk_block !! each block copy data from LDS to global.\n");
             }
+            // Emin @added
+            __syncthreads();
+
             // each block copy its data from LDS to global
             c_shuffle_block_copy_lds_to_global
                 .template Run<decltype(c_shuffle_block_buf),
...
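
Several of the hunks above feed loop bounds and offsets through __builtin_amdgcn_readfirstlane. A standalone sketch of what that buys, assuming HIP on an AMD GCN/CDNA target and a value that really is uniform across the wavefront (the kernel is illustrative, not from the commit):

    #include <hip/hip_runtime.h>
    #include <cstdint>

    // __builtin_amdgcn_readfirstlane broadcasts lane 0's value to the whole
    // wavefront, letting the compiler keep quantities such as iteration
    // counts in scalar registers (SGPRs) rather than one VGPR per lane.
    __global__ void readfirstlane_demo(const uint32_t* __restrict__ lengths)
    {
        // Assumed uniform within the block; readfirstlane makes it explicit.
        uint32_t len = __builtin_amdgcn_readfirstlane(lengths[blockIdx.x]);
        for(uint32_t i = 0; i < len; ++i)
        {
            // loop control now runs on the scalar unit
        }
    }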
@@ -106,8 +106,8 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
     // Debug: Print source vector data if valid
     if (threadIdx.x == 0 && threadIdx.y == 0 && is_src_valid) {
         // printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f\n", static_cast<int>(idx_1d.value), static_cast<float>());
-        // printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
-        printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n", static_cast<int>(idx_1d.value), src_vector_container.template AsType<SrcData>().At(Number<0>{}));
+        printf("BlockId %d - Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast<int>(blockIdx.x), static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
+        // printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n", static_cast<int>(idx_1d.value), src_vector_container.template AsType<SrcData>().At(Number<0>{}));
     }

     // apply pointwise operation
@@ -123,25 +123,43 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
             // printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(v));
             // }
+            // Emin @added
+            __syncthreads();
+
             // Debug: Print SrcData before and after applying element-wise operation
             if (threadIdx.x == 0 && threadIdx.y == 0) {
                 // printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<i>{})));
-                printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %hu \n", static_cast<int>(idx_1d.value), static_cast<int>(i.value), src_vector_container.template AsType<SrcData>().At(Number<i>{}));
+                // printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %hu \n", static_cast<int>(blockIdx.x), static_cast<int>(idx_1d.value), static_cast<int>(i.value), src_vector_container.template AsType<SrcData>().At(Number<i>{}));
+                // // printf("SrcData after element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(v));
+                // printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %hu \n", static_cast<int>(blockIdx.x), static_cast<int>(idx_1d.value), static_cast<int>(i.value), v);
+                printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d , i %d: %f \n", static_cast<int>(blockIdx.x), static_cast<int>(idx_1d.value), static_cast<int>(i.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<i>{})));
                 // printf("SrcData after element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(v));
-                printf("Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %hu \n", static_cast<int>(idx_1d.value), static_cast<int>(i.value), v);
+                printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %f \n", static_cast<int>(blockIdx.x), static_cast<int>(idx_1d.value), static_cast<int>(i.value), static_cast<float>(v));
             }
+            // Emin @added
+            __syncthreads();
+
             // apply type convert
             dst_vector_container.template AsType<DstData>()(i) = type_convert<DstData>(v);
+            // Emin @added
+            __syncthreads();
+
             // Emin @debug
             // Debug: Print type conversion result
             if (threadIdx.x == 0 && threadIdx.y == 0) {
                 // printf("Threadwise_tensor slice v6r1r2 line 121 : Type Conversion Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(dst_vector_container.template AsType<DstData>()[i]));
                 // printf("DstData after type conversion at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(dst_vector_container.template AsType<DstData>().At(Number<i>{})));
-                printf("Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i %d: %hu \n", static_cast<int>(idx_1d.value), static_cast<int>(i.value), dst_vector_container.template AsType<DstData>().At(Number<i>{}));
+                // printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i %d: %hu \n", static_cast<int>(blockIdx.x), static_cast<int>(idx_1d.value), static_cast<int>(i.value), dst_vector_container.template AsType<DstData>().At(Number<i>{}));
+                printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i %d: %f \n", static_cast<int>(blockIdx.x), static_cast<int>(idx_1d.value), static_cast<int>(i.value), static_cast<float>(dst_vector_container.template AsType<DstData>().At(Number<i>{})));
             }
+            // Emin @added
+            __syncthreads();
+
         });

         const bool is_dst_valid =
@@ -153,9 +171,12 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
             is_dst_valid,
             dst_vector_container.template AsType<dst_vector_t>()[I0]);
-        // // Debug: Print data before copying from dst_vector into dst_buf
+        // // // // Debug: Print data before copying from dst_vector into dst_buf
         // if (threadIdx.x == 0 && threadIdx.y == 0 && is_dst_valid) {
-        //     printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<dst_vector_t>().At(I0));
+        //     // printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<DstData>().At(I0));
+        //     // printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x), static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
+        //     printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x), static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
         // }

         // move coordinate
...
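
The printf changes in this file all replace %hu with a static_cast<float> and %f. Device printf cannot format 16-bit floats directly: passed through %hu they print as their raw IEEE-754 bit pattern, while widening to float prints the numeric value. A hypothetical standalone kernel showing the difference, assuming SrcData is a 16-bit float type as in the traced GEMM:

    #include <hip/hip_runtime.h>
    #include <cstdio>

    // Demo kernel, not part of the commit.
    __global__ void print_half_demo()
    {
        if(threadIdx.x == 0)
        {
            _Float16 v = static_cast<_Float16>(1.5f);
            unsigned short bits;
            __builtin_memcpy(&bits, &v, sizeof(bits));
            printf("raw bits: %hu\n", bits);                 // 15872 (0x3E00)
            printf("value:    %f\n", static_cast<float>(v)); // 1.500000
        }
    }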
@@ -15,17 +15,24 @@ else
 fi
 cmake \
-    -D CMAKE_PREFIX_PATH=/opt/rocm-6.2.1/ \
-    -D CMAKE_CXX_COMPILER=/opt/rocm-6.2.1/bin/hipcc \
+    -D CMAKE_PREFIX_PATH=/opt/rocm \
+    -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
     -D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
     -D CMAKE_BUILD_TYPE=Release \
     -D BUILD_DEV=ON \
     -D GPU_TARGETS=$GPU_TARGETS \
     -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
     -D USE_BITINT_EXTENSION_INT4=OFF \
+    -D CK_LOGGING=ON \
     $REST_ARGS \
     ${MY_PROJECT_SOURCE}

 # -D CMAKE_PREFIX_PATH=/opt/rocm \
 # -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc
+# -D CMAKE_PREFIX_PATH=/opt/rocm-6.2.1/ \
+# -D CMAKE_CXX_COMPILER=/opt/rocm-6.2.1/bin/hipcc
+# -D CK_LOGGING=ON
\ No newline at end of file
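
The script now configures with -D CK_LOGGING=ON and points at the unversioned /opt/rocm symlink instead of pinning ROCm 6.2.1, so it works against whichever ROCm version is installed. As a generic illustration only, not CK's actual macro set, this is the kind of compile-time gate such an option typically feeds, assuming the CMake variable is forwarded to the compiler as a preprocessor define:

    #include <hip/hip_runtime.h>
    #include <cstdio>

    // Hypothetical logging gate: with the define absent the trace compiles
    // away entirely, so release kernels pay nothing for the printf.
    #ifdef CK_LOGGING
    #define DEBUG_TRACE(...)                         \
        do                                           \
        {                                            \
            if(threadIdx.x == 0 && threadIdx.y == 0) \
            {                                        \
                printf(__VA_ARGS__);                 \
            }                                        \
        } while(0)
    #else
    #define DEBUG_TRACE(...) \
        do                   \
        {                    \
        } while(0)
    #endif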