Commit be8e0766 authored by root

trace copy
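Adds printf tracing through the stream-K GEMM path (block-to-tile mapping,
gridwise kernel, threadwise slice transfer), drops the StreamConfig timing
defaults to a single warm-up and repeat, and gates check_err mismatch output
behind CK_LOGGING.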

parent 6f210155
@@ -10,10 +10,14 @@ struct StreamConfig
{
hipStream_t stream_id_ = nullptr;
bool time_kernel_ = false;
int log_level_ = 0;
int cold_niters_ = 5;
int nrepeat_ = 50;
int log_level_ = 1;
int cold_niters_ = 1;
int nrepeat_ = 1;
bool flush_cache = false;
int rotating_count = 1;
};
// #Emin: previous defaults were log_level_ = 0, cold_niters_ = 5, nrepeat_ = 50
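// Context, not part of the original change: a minimal usage sketch, assuming
// CK's launch_and_time_kernel helper from kernel_launch.hpp. With the new
// defaults a traced run does one warm-up and one timed repeat, which keeps the
// printf tracing added in this commit readable.
#if 0 // usage sketch only
template <typename Kernel, typename... Args>
float trace_run_example(Kernel kernel, dim3 grid, dim3 block, Args... args)
{
    StreamConfig cfg{};       // stream_id_ = nullptr -> default HIP stream
    cfg.time_kernel_ = true;  // timed with cold_niters_ = 1 warm-up, nrepeat_ = 1 repeat
    return launch_and_time_kernel(cfg, kernel, grid, block, 0, args...);
}
#endif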
@@ -10,6 +10,7 @@
#include <limits>
#include <stdlib.h>
namespace ck {
// Rows of column-vectors
@@ -1180,29 +1181,8 @@ struct BlockToCTileMap_GemmStreamK
eqav_tiles_little = MDiv(upper_little / k_iters_per_tile.get());
}
#if 0
printf("cu:%d, occupancy:%d, grids:%d, num_tiles:%d, dp_tiles:%d, sk_num_big_blocks:%d, "
"sk_num_blocks:%d, "
"sk_total_iters:%d, dp_start_block_idx:%d, dp_iters_per_block:%d, dp_num_blocks:%d, "
"k_iters_per_tile:%d, k_iters_per_big_block:%d, reduction_start_block_idx:%u, "
"sk_tiles:%u, workspace(acc float):%u\n",
num_cu,
occupancy,
get_grid_dims().x,
num_tiles,
dp_tiles,
sk_num_big_blocks,
sk_num_blocks,
sk_total_iters,
dp_start_block_idx,
dp_iters_per_block,
dp_num_blocks,
k_iters_per_tile.get(),
k_iters_per_big_block,
reduction_start_block_idx,
get_sk_tiles(),
get_workspace_size(sizeof(float)));
#endif
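        // Illustrative arithmetic for the fields above (numbers invented for the
        // example; assumes the standard Stream-K split where big blocks carry one
        // extra K iteration): with sk_tiles = 3 and k_iters_per_tile = 8,
        // sk_total_iters = 24; splitting across sk_num_blocks = 5 gives
        // k_iters_per_big_block = ceil(24 / 5) = 5, hence sk_num_big_blocks = 4
        // (4 blocks * 5 iters + 1 block * 4 iters = 24).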
}
__host__ __device__ uint32_t get_sk_total_iters() const
@@ -1517,6 +1497,30 @@ struct BlockToCTileMap_GemmStreamK_v2
equiv_tiles_big = MDiv(upper_big / k_iters_per_tile.get());
equiv_tiles_little = MDiv(upper_little / k_iters_per_tile.get());
}
#if 0
printf("grid_size:%d, num_tiles:%d, dp_tiles:%d, sk_num_big_blocks:%d, "
"sk_num_blocks:%d, "
"sk_total_iters:%d, dp_start_block_idx:%d,dp_num_blocks:%d, "
"k_iters_per_tile:%d, k_iters_per_big_block:%d, reduction_start_block_idx:%u, "
"sk_tiles:%u, workspace(acc float):%u\n",
grid_size,
num_tiles,
dp_tiles,
sk_num_big_blocks,
sk_num_blocks,
sk_total_iters,
dp_start_block_idx,
dp_num_blocks,
k_iters_per_tile.get(),
k_iters_per_big_block,
reduction_start_block_idx,
get_sk_tiles(),
get_workspace_size(sizeof(float)));
#endif
}
__host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N)
@@ -1065,6 +1065,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
}
}
// #Emin: TODO: comment this block out or remove it
if constexpr(is_same<remove_cvref_t<CDataType>, bhalf_t>::value)
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
@@ -1148,11 +1149,20 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
bool is_sk_block, is_dp_block;
index_t num_k_block_main_loop;
// Emin @debug
// Debug: Print initial problem size and grid configuration
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Gridwise_gemm_sk Line:1157 Problem M: %d, N: %d, K: %d, Grid Size: %d\n", problem.M, problem.N, problem.K, problem.Grid_size);
// }
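        // Persistent-kernel loop: each physical workgroup strides through the
        // logical block indices by gridDim.x until the logical grid is covered.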
for(auto block_idx = get_block_1d_id();
block_idx < block_2_ctile_map_streamk.get_grid_dims();
block_idx += gridDim.x)
{
// Determine if the block is stream-k or data-parallel
is_sk_block =
static_cast<uint32_t>(block_idx) < block_2_ctile_map_streamk.sk_num_blocks;
is_dp_block =
@@ -1160,9 +1170,17 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
static_cast<uint32_t>(block_idx) <
block_2_ctile_map_streamk.reduction_start_block_idx;
// Get the iteration range for the current block
block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end);
num_k_block_main_loop = iter_end - iter_start;
// Emin @debug
// Debug: Print block information
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            printf("Block Index: %d, Iteration Start: %d, Iteration End: %d, Is Stream-K: %d, Is Data-Parallel: %d\n",
                   block_idx, iter_start, iter_end, is_sk_block, is_dp_block);
        }
while(true)
{
uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
@@ -1173,6 +1191,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
iter_end - 1, tile_idx, iter_offset);
iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1);
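            // readfirstlane keeps these loop-control values in scalar registers,
            // so the whole wavefront agrees on the iteration window it processes.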
// Create grid descriptors for A, B, C matrices
const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(problem.M,
problem.MPadded,
problem.K,
@@ -1191,6 +1210,17 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_grid_desc_m_n, problem.MBlock, problem.NBlock);
// Emin @debug
// Debug: Print grid descriptor sizes
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            // cast explicitly so %ld matches even when index_t is 32-bit
            printf("A Grid Desc Size: %ld, B Grid Desc Size: %ld, C Grid Desc Size: %ld\n",
                   static_cast<long>(a_grid_desc_ak0_m_ak1.GetElementSpaceSize()),
                   static_cast<long>(b_grid_desc_bk0_n_bk1.GetElementSpaceSize()),
                   static_cast<long>(c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()));
        }
// Create dynamic buffers for A, B, C matrices in global memory
auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
@@ -1200,6 +1230,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
// Map tile index to spatial coordinates (M, N)
auto block_work_idx =
block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N);
@@ -1216,9 +1248,17 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
const index_t k0_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(iter_offset * AK0Number);
// Emin @debug
// Debug: Print block data indices on grid
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            printf("M Block Data Index on Grid: %d, N Block Data Index on Grid: %d, K0 Block Data Index: %d\n",
                   m_block_data_idx_on_grid, n_block_data_idx_on_grid, k0_block_data_idx_on_grid);
        }
// lds max alignment
constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
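        // e.g. AK1Number = 8, BK1Number = 4 gives max_lds_align = lcm(8, 4) = 8,
        // so both LDS tiles below start on a common 8-element boundary.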
// Create shared memory buffers for A and B matrices in LDS
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_ak0_m_ak1 =
GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
@@ -1308,6 +1348,16 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
constexpr auto b_block_slice_copy_step =
make_multi_index(KPerBlock / BK1Number, 0, 0);
// Emin @debug
// Debug: Print shared memory buffer sizes for A and B
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Shared Memory Buffer Size - A: %d, B: %d\n",
// a_block_desc_ak0_m_ak1.GetElementSpaceSize(),
// b_block_desc_bk0_n_bk1.GetElementSpaceSize());
// }
// Blockwise GEMM pipeline
static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
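        // The static_assert above guarantees the pipeline can be materialized
        // here without constructor arguments; it presumably carries no state.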
@@ -1317,6 +1367,12 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
(a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
KPerBlock);
// Emin @debug
// Debug: Print number of K blocks in main loop
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            printf("Number of K Blocks in Main Loop: %d\n", num_k_block_main_loop);
        }
blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
a_grid_desc_ak0_m_ak1,
a_block_desc_ak0_m_ak1,
@@ -1508,6 +1564,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// make sure it's safe to write to LDS
block_sync_lds();
// Emin @debug
// Debug: Print before writing C to LDS
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            printf("Gridwise_gemm_sk line 1570: Block %d, Access %d: writing C from VGPR to LDS.\n",
                   block_idx, static_cast<int>(access_id));
        }
// each thread write its data from VGPR to LDS
c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
@@ -1535,6 +1598,10 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
}
else if(is_sk_block)
{
                if(threadIdx.x == 0 && threadIdx.y == 0)
                {
                    printf("Gridwise_gemm_sk line 1602: is_sk_block, copying C tile from LDS to global.\n");
                }
// each block copy its data from LDS to global
c_shuffle_block_copy_lds_to_global
.template Run<decltype(c_shuffle_block_buf),
@@ -102,6 +102,14 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
auto dst_vector_container = dst_vector_type{};
// Emin @debug
// Debug: Print source vector data if valid
        if(threadIdx.x == 0 && threadIdx.y == 0 && is_src_valid)
        {
            // float variant (requires a SrcData convertible to float):
            // printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
            printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n",
                   static_cast<int>(idx_1d.value),
                   src_vector_container.template AsType<SrcData>().At(Number<0>{}));
        }
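        // Note (assuming SrcData is bhalf_t, i.e. a ushort bit pattern): %hu
        // dumps the raw 16 bits; for a human-readable value, print
        // type_convert<float>(...) with %f, as the commented variant above does.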
// apply pointwise operation
static_for<0, ScalarPerVector, 1>{}([&](auto i) {
SrcData v;
@@ -109,8 +117,31 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
// apply element-wise operation
element_op_(v, src_vector_container.template AsType<SrcData>()[i]);
// Emin @debug
// Debug: Print element-wise operation result
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(v));
// }
// Debug: Print SrcData before and after applying element-wise operation
                if(threadIdx.x == 0 && threadIdx.y == 0)
                {
                    printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d, i %d: %hu \n",
                           static_cast<int>(idx_1d.value),
                           static_cast<int>(i.value),
                           src_vector_container.template AsType<SrcData>().At(Number<i>{}));
                    printf("Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d, i %d: %hu \n",
                           static_cast<int>(idx_1d.value),
                           static_cast<int>(i.value),
                           v);
                }
// apply type convert
dst_vector_container.template AsType<DstData>()(i) = type_convert<DstData>(v);
// Emin @debug
// Debug: Print type conversion result
                if(threadIdx.x == 0 && threadIdx.y == 0)
                {
                    printf("Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i %d: %hu \n",
                           static_cast<int>(idx_1d.value),
                           static_cast<int>(i.value),
                           dst_vector_container.template AsType<DstData>().At(Number<i>{}));
                }
});
const bool is_dst_valid =
@@ -122,6 +153,11 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>()[I0]);
// // Debug: Print data before copying from dst_vector into dst_buf
// if (threadIdx.x == 0 && threadIdx.y == 0 && is_dst_valid) {
// printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<dst_vector_t>().At(I0));
// }
// move coordinate
if constexpr(idx_1d.value != num_access - 1)
{
@@ -31,7 +31,7 @@ typename std::enable_if<
bool>::type
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
const std::string& msg = "#Emin @debug Error: Incorrect results!",
double rtol = 1e-5,
double atol = 3e-6)
{
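    // Note on the CK_LOGGING gate used below (assuming CK's usual env-var
    // plumbing, e.g. running with CK_LOGGING=1): with logging enabled the
    // overloads either stay silent or dump far more than the usual 5
    // mismatches, depending on the overload; with logging disabled the
    // original "first 5 mismatches" output is kept.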
@@ -55,12 +55,30 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; the
            // original behavior (print the first 5 mismatches) is kept in the
            // else branch.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                // toggle: uncomment to dump every mismatch instead of staying silent
                // if(err_count < ref.size())
                // {
                //     std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                //               << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                // }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)
@@ -106,12 +124,34 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; with
            // logging enabled, dump up to 1000000 mismatches, otherwise keep
            // the original first-5 output.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                if(err_count < 1000000)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)
@@ -156,12 +196,34 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; the
            // original behavior (print the first 5 mismatches) is kept in the
            // else branch.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                // toggle: uncomment to dump every mismatch instead of staying silent
                // if(err_count < ref.size())
                // {
                //     std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                //               << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                // }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)
@@ -213,12 +275,34 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; the
            // original behavior (print the first 5 mismatches) is kept in the
            // else branch.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                // toggle: uncomment to dump every mismatch instead of staying silent
                // if(err_count < ref.size())
                // {
                //     std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                //               << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                // }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)
@@ -262,12 +346,34 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; the
            // original behavior (print the first 5 mismatches) is kept in the
            // else branch.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                // toggle: uncomment to dump every mismatch instead of staying silent
                // if(err_count < ref.size())
                // {
                //     std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                //               << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                // }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)
@@ -308,12 +414,34 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; with
            // logging enabled, dump every mismatch, otherwise keep the original
            // first-5 output.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                if(static_cast<std::size_t>(err_count) < ref.size())
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)