Commit be8e0766 authored by root

trace copy
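Adds printf tracing through the stream-K GEMM path (block-to-tile mapping,
gridwise kernel, threadwise slice transfer), drops the StreamConfig timing
defaults to a single warm-up and repeat, and gates check_err mismatch output
behind CK_LOGGING.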

parent 6f210155
@@ -10,10 +10,14 @@ struct StreamConfig
{
hipStream_t stream_id_ = nullptr;
bool time_kernel_ = false;
int log_level_ = 0;
int cold_niters_ = 5;
int nrepeat_ = 50;
int log_level_ = 1;
int cold_niters_ = 1;
int nrepeat_ = 1;
bool flush_cache = false;
int rotating_count = 1;
};
// #Emin: previous defaults were log_level_ = 0, cold_niters_ = 5, nrepeat_ = 50
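// Context, not part of the original change: a minimal usage sketch, assuming
// CK's launch_and_time_kernel helper from kernel_launch.hpp. With the new
// defaults a traced run does one warm-up and one timed repeat, which keeps the
// printf tracing added in this commit readable.
#if 0 // usage sketch only
template <typename Kernel, typename... Args>
float trace_run_example(Kernel kernel, dim3 grid, dim3 block, Args... args)
{
    StreamConfig cfg{};       // stream_id_ = nullptr -> default HIP stream
    cfg.time_kernel_ = true;  // timed with cold_niters_ = 1 warm-up, nrepeat_ = 1 repeat
    return launch_and_time_kernel(cfg, kernel, grid, block, 0, args...);
}
#endif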
@@ -10,6 +10,7 @@
#include <limits>
#include <stdlib.h>
namespace ck {
// Rows of column-vectors
@@ -1180,29 +1181,8 @@ struct BlockToCTileMap_GemmStreamK
eqav_tiles_little = MDiv(upper_little / k_iters_per_tile.get());
}
#if 0
printf("cu:%d, occupancy:%d, grids:%d, num_tiles:%d, dp_tiles:%d, sk_num_big_blocks:%d, "
"sk_num_blocks:%d, "
"sk_total_iters:%d, dp_start_block_idx:%d, dp_iters_per_block:%d, dp_num_blocks:%d, "
"k_iters_per_tile:%d, k_iters_per_big_block:%d, reduction_start_block_idx:%u, "
"sk_tiles:%u, workspace(acc float):%u\n",
num_cu,
occupancy,
get_grid_dims().x,
num_tiles,
dp_tiles,
sk_num_big_blocks,
sk_num_blocks,
sk_total_iters,
dp_start_block_idx,
dp_iters_per_block,
dp_num_blocks,
k_iters_per_tile.get(),
k_iters_per_big_block,
reduction_start_block_idx,
get_sk_tiles(),
get_workspace_size(sizeof(float)));
#endif
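        // Illustrative arithmetic for the fields above (numbers invented for the
        // example; assumes the standard Stream-K split where big blocks carry one
        // extra K iteration): with sk_tiles = 3 and k_iters_per_tile = 8,
        // sk_total_iters = 24; splitting across sk_num_blocks = 5 gives
        // k_iters_per_big_block = ceil(24 / 5) = 5, hence sk_num_big_blocks = 4
        // (4 blocks * 5 iters + 1 block * 4 iters = 24).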
}
__host__ __device__ uint32_t get_sk_total_iters() const
@@ -1517,6 +1497,30 @@ struct BlockToCTileMap_GemmStreamK_v2
equiv_tiles_big = MDiv(upper_big / k_iters_per_tile.get());
equiv_tiles_little = MDiv(upper_little / k_iters_per_tile.get());
}
#if 0
printf("grid_size:%d, num_tiles:%d, dp_tiles:%d, sk_num_big_blocks:%d, "
"sk_num_blocks:%d, "
"sk_total_iters:%d, dp_start_block_idx:%d,dp_num_blocks:%d, "
"k_iters_per_tile:%d, k_iters_per_big_block:%d, reduction_start_block_idx:%u, "
"sk_tiles:%u, workspace(acc float):%u\n",
grid_size,
num_tiles,
dp_tiles,
sk_num_big_blocks,
sk_num_blocks,
sk_total_iters,
dp_start_block_idx,
dp_num_blocks,
k_iters_per_tile.get(),
k_iters_per_big_block,
reduction_start_block_idx,
get_sk_tiles(),
get_workspace_size(sizeof(float)));
#endif
}
__host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N)
@@ -1065,6 +1065,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
}
}
// #Emin: TODO: comment this block out or remove it
if constexpr(is_same<remove_cvref_t<CDataType>, bhalf_t>::value)
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
@@ -1148,11 +1149,20 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
bool is_sk_block, is_dp_block;
index_t num_k_block_main_loop;
// Emin @debug
// Debug: Print initial problem size and grid configuration
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Gridwise_gemm_sk Line:1157 Problem M: %d, N: %d, K: %d, Grid Size: %d\n", problem.M, problem.N, problem.K, problem.Grid_size);
// }
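        // Persistent-kernel loop: each physical workgroup strides through the
        // logical block indices by gridDim.x until the logical grid is covered.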
for(auto block_idx = get_block_1d_id();
block_idx < block_2_ctile_map_streamk.get_grid_dims();
block_idx += gridDim.x)
{
// Determine if the block is stream-k or data-parallel
is_sk_block =
static_cast<uint32_t>(block_idx) < block_2_ctile_map_streamk.sk_num_blocks;
is_dp_block =
@@ -1160,9 +1170,17 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
static_cast<uint32_t>(block_idx) <
block_2_ctile_map_streamk.reduction_start_block_idx;
// Get the iteration range for the current block
block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end);
num_k_block_main_loop = iter_end - iter_start;
// Emin @debug
// Debug: Print block information
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            printf("Block Index: %d, Iteration Start: %d, Iteration End: %d, Is Stream-K: %d, Is Data-Parallel: %d\n",
                   block_idx, iter_start, iter_end, is_sk_block, is_dp_block);
        }
while(true)
{
uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
@@ -1173,6 +1191,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
iter_end - 1, tile_idx, iter_offset);
iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1);
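            // readfirstlane keeps these loop-control values in scalar registers,
            // so the whole wavefront agrees on the iteration window it processes.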
// Create grid descriptors for A, B, C matrices
const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(problem.M,
problem.MPadded,
problem.K,
@@ -1191,6 +1210,17 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_grid_desc_m_n, problem.MBlock, problem.NBlock);
// Emin @debug
// Debug: Print grid descriptor sizes
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            // cast explicitly so %ld matches even when index_t is 32-bit
            printf("A Grid Desc Size: %ld, B Grid Desc Size: %ld, C Grid Desc Size: %ld\n",
                   static_cast<long>(a_grid_desc_ak0_m_ak1.GetElementSpaceSize()),
                   static_cast<long>(b_grid_desc_bk0_n_bk1.GetElementSpaceSize()),
                   static_cast<long>(c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()));
        }
// Create dynamic buffers for A, B, C matrices in global memory
auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
@@ -1200,6 +1230,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
// Map tile index to spatial coordinates (M, N)
auto block_work_idx =
block_2_ctile_map_streamk.tile_to_spatial(tile_idx, problem.M, problem.N);
@@ -1216,9 +1248,17 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
const index_t k0_block_data_idx_on_grid =
__builtin_amdgcn_readfirstlane(iter_offset * AK0Number);
// Emin @debug
// Debug: Print block data indices on grid
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            printf("M Block Data Index on Grid: %d, N Block Data Index on Grid: %d, K0 Block Data Index: %d\n",
                   m_block_data_idx_on_grid, n_block_data_idx_on_grid, k0_block_data_idx_on_grid);
        }
// lds max alignment
constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
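        // e.g. AK1Number = 8, BK1Number = 4 gives max_lds_align = lcm(8, 4) = 8,
        // so both LDS tiles below start on a common 8-element boundary.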
// Create shared memory buffers for A and B matrices in LDS
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_ak0_m_ak1 =
GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
@@ -1308,6 +1348,16 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
constexpr auto b_block_slice_copy_step =
make_multi_index(KPerBlock / BK1Number, 0, 0);
// Emin @debug
// Debug: Print shared memory buffer sizes for A and B
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Shared Memory Buffer Size - A: %d, B: %d\n",
// a_block_desc_ak0_m_ak1.GetElementSpaceSize(),
// b_block_desc_bk0_n_bk1.GetElementSpaceSize());
// }
// Blockwise GEMM pipeline
static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
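        // The static_assert above guarantees the pipeline can be materialized
        // here without constructor arguments; it presumably carries no state.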
@@ -1317,6 +1367,12 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
(a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
KPerBlock);
// Emin @debug
// Debug: Print number of K blocks in main loop
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            printf("Number of K Blocks in Main Loop: %d\n", num_k_block_main_loop);
        }
blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
a_grid_desc_ak0_m_ak1,
a_block_desc_ak0_m_ak1,
@@ -1508,6 +1564,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// make sure it's safe to write to LDS
block_sync_lds();
// Emin @debug
// Debug: Print before writing C to LDS
        if(threadIdx.x == 0 && threadIdx.y == 0)
        {
            printf("Gridwise_gemm_sk line 1570: Block %d, Access %d: writing C from VGPR to LDS.\n",
                   block_idx, static_cast<int>(access_id));
        }
// each thread write its data from VGPR to LDS
c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
@@ -1535,6 +1598,10 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
}
else if(is_sk_block)
{
                if(threadIdx.x == 0 && threadIdx.y == 0)
                {
                    printf("Gridwise_gemm_sk line 1602: is_sk_block, copying C tile from LDS to global.\n");
                }
// each block copy its data from LDS to global
c_shuffle_block_copy_lds_to_global
.template Run<decltype(c_shuffle_block_buf),
@@ -102,6 +102,14 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
auto dst_vector_container = dst_vector_type{};
// Emin @debug
// Debug: Print source vector data if valid
        if(threadIdx.x == 0 && threadIdx.y == 0 && is_src_valid)
        {
            // float variant (requires a SrcData convertible to float):
            // printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
            printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n",
                   static_cast<int>(idx_1d.value),
                   src_vector_container.template AsType<SrcData>().At(Number<0>{}));
        }
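        // Note (assuming SrcData is bhalf_t, i.e. a ushort bit pattern): %hu
        // dumps the raw 16 bits; for a human-readable value, print
        // type_convert<float>(...) with %f, as the commented variant above does.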
// apply pointwise operation
static_for<0, ScalarPerVector, 1>{}([&](auto i) {
SrcData v;
@@ -109,8 +117,31 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
// apply element-wise operation
element_op_(v, src_vector_container.template AsType<SrcData>()[i]);
// Emin @debug
// Debug: Print element-wise operation result
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(v));
// }
// Debug: Print SrcData before and after applying element-wise operation
                if(threadIdx.x == 0 && threadIdx.y == 0)
                {
                    printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d, i %d: %hu \n",
                           static_cast<int>(idx_1d.value),
                           static_cast<int>(i.value),
                           src_vector_container.template AsType<SrcData>().At(Number<i>{}));
                    printf("Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d, i %d: %hu \n",
                           static_cast<int>(idx_1d.value),
                           static_cast<int>(i.value),
                           v);
                }
// apply type convert
dst_vector_container.template AsType<DstData>()(i) = type_convert<DstData>(v);
// Emin @debug
// Debug: Print type conversion result
                if(threadIdx.x == 0 && threadIdx.y == 0)
                {
                    printf("Threadwise_tensor_slice_v6r1r2 line 140 : DstData after type conversion at idx %d, i %d: %hu \n",
                           static_cast<int>(idx_1d.value),
                           static_cast<int>(i.value),
                           dst_vector_container.template AsType<DstData>().At(Number<i>{}));
                }
});
const bool is_dst_valid =
@@ -122,6 +153,11 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>()[I0]);
// // Debug: Print data before copying from dst_vector into dst_buf
// if (threadIdx.x == 0 && threadIdx.y == 0 && is_dst_valid) {
// printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<dst_vector_t>().At(I0));
// }
// move coordinate
if constexpr(idx_1d.value != num_access - 1)
{
@@ -31,7 +31,7 @@ typename std::enable_if<
bool>::type
check_err(const Range& out,
const RefRange& ref,
const std::string& msg = "Error: Incorrect results!",
const std::string& msg = "#Emin @debug Error: Incorrect results!",
double rtol = 1e-5,
double atol = 3e-6)
{
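    // Note on the CK_LOGGING gate used below (assuming CK's usual env-var
    // plumbing, e.g. running with CK_LOGGING=1): with logging enabled the
    // overloads either stay silent or dump far more than the usual 5
    // mismatches, depending on the overload; with logging disabled the
    // original "first 5 mismatches" output is kept.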
@@ -55,12 +55,30 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; the
            // original behavior (print the first 5 mismatches) is kept in the
            // else branch.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                // toggle: uncomment to dump every mismatch instead of staying silent
                // if(err_count < ref.size())
                // {
                //     std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                //               << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                // }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)
@@ -106,12 +124,34 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; with
            // logging enabled, dump up to 1000000 mismatches, otherwise keep
            // the original first-5 output.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                if(err_count < 1000000)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)
@@ -156,12 +196,34 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; the
            // original behavior (print the first 5 mismatches) is kept in the
            // else branch.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                // toggle: uncomment to dump every mismatch instead of staying silent
                // if(err_count < ref.size())
                // {
                //     std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                //               << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                // }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)
@@ -213,12 +275,34 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; the
            // original behavior (print the first 5 mismatches) is kept in the
            // else branch.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                // toggle: uncomment to dump every mismatch instead of staying silent
                // if(err_count < ref.size())
                // {
                //     std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                //               << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                // }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)
@@ -262,12 +346,34 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; the
            // original behavior (print the first 5 mismatches) is kept in the
            // else branch.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                // toggle: uncomment to dump every mismatch instead of staying silent
                // if(err_count < ref.size())
                // {
                //     std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                //               << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                // }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)
@@ -308,12 +414,34 @@ check_err(const Range& out,
{
max_err = err > max_err ? err : max_err;
err_count++;
            // #Emin @debug: gate mismatch printing behind CK_LOGGING; with
            // logging enabled, dump every mismatch, otherwise keep the original
            // first-5 output.
            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
            {
                if(static_cast<std::size_t>(err_count) < ref.size())
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
            else
            {
                if(err_count < 5)
                {
                    std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                              << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
                }
                res = false;
            }
}
}
if(!res)