Commit d891a596 authored by root

Turn off some trace output

parent dc3519ae
@@ -21,6 +21,82 @@ using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// clang-format off
using DeviceGemmV2_Streamk_Instance =
ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
PassThrough, PassThrough, PassThrough, GemmDefault,
64, // Block Size
16, // MPer Block
16, // NPer Block
64, // KPer Block
8, // AK1
8, // BK1
16, // MPer XDL
16, // NPer XDL
1, // Mxdl Per Wave
1, // Nxdl Per Wave
S<8, 8, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
2, // ABlockTransfer SrcVectorDim
8, // ABlockTransfer SrcScalar PerVector
8, // ABlockTransfer DstScalar PerVector_K1
0, // ABlockLds AddExtraM
S<8, 8, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
2, // BBlockTransfer SrcVectorDim
8, // BBlockTransfer SrcScalar PerVector
8, // BBlockTransfer DstScalar PerVector_K1
0, // BBlockLds AddExtraN
1, // CShuffle MXdlPerWave PerShuffle
1, // CShuffle NXdlPerWave PerShuffle
S<1, 16, 1, 4>, // CBlockTransferClusterLengths _MBlock_MXdlPerWave_MWaveMPerXdl _NBlock_NXdlPerWave_NWaveNPerXdl
4, // CBlockTransfer ScalarPerVector _NWaveNPerXdl
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
// clang-format on
// // clang-format off
// using DeviceGemmV2_Streamk_Instance =
// ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
// ALayout, BLayout, CLayout,
// ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
// PassThrough, PassThrough, PassThrough, GemmDefault,
// 64, // Block Size
// 16, // MPer Block
// 16, // NPer Block
// 128, // KPer Block
// 8, // AK1
// 8, // BK1
// 16, // MPer XDL
// 16, // NPer XDL
// 1, // Mxdl Per Wave
// 1, // Nxdl Per Wave
// S<16, 4, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1
// S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder
// S<1, 0, 2>, // ABlockTransfer SrcAccessOrder
// 2, // ABlockTransfer SrcVectorDim
// 8, // ABlockTransfer SrcScalar PerVector
// 8, // ABlockTransfer DstScalar PerVector_K1
// 0, // ABlockLds AddExtraM
// S<16, 4, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1
// S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder
// S<1, 0, 2>, // BBlockTransfer SrcAccessOrder
// 2, // BBlockTransfer SrcVectorDim
// 8, // BBlockTransfer SrcScalar PerVector
// 8, // BBlockTransfer DstScalar PerVector_K1
// 0, // BBlockLds AddExtraN
// 1, // CShuffle MXdlPerWave PerShuffle
// 1, // CShuffle NXdlPerWave PerShuffle
// S<1, 16, 1, 4>, // CBlockTransferClusterLengths _MBlock_MXdlPerWave_MWaveMPerXdl _NBlock_NXdlPerWave_NWaveNPerXdl
// 4, // CBlockTransfer ScalarPerVector _NWaveNPerXdl
// ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
// // clang-format on
#if 0
// clang-format off
using DeviceGemmV2_Streamk_Instance =
ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
@@ -40,6 +116,8 @@ using DeviceGemmV2_Streamk_Instance =
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3>;
// clang-format on
#endif
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
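For context, a minimal host-side sketch of how a CK device-op instance such as DeviceGemmV2_Streamk_Instance is typically driven, assuming the usual composable_kernel example pattern (MakeArgument / MakeInvoker / IsSupportedArgument / Run). The names a_dev, b_dev, c_dev and the size/stride variables are placeholders, and the exact MakeArgument overload of the Stream-K V3 op may take additional stream-k parameters, so treat this as an illustration rather than the code used by this commit:

// Hedged sketch: follows the common CK example driver pattern; variable names are placeholders.
auto gemm    = DeviceGemmV2_Streamk_Instance{};
auto invoker = gemm.MakeInvoker();

// a_dev/b_dev/c_dev are assumed device allocations of ADataType/BDataType/CDataType.
auto argument = gemm.MakeArgument(a_dev, b_dev, c_dev,
                                  M, N, K,
                                  StrideA, StrideB, StrideC,
                                  AElementOp{}, BElementOp{}, CElementOp{});

if(gemm.IsSupportedArgument(argument))
{
    // StreamConfig{nullptr, true} runs on the default stream and reports kernel time.
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
}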
@@ -1149,13 +1149,15 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
bool is_sk_block, is_dp_block;
index_t num_k_block_main_loop;
#if 0
// Emin @debug
// Debug: Print initial problem size and grid configuration
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Gridwise_gemm_sk Line:1157 Problem M: %d, N: %d, K: %d, Grid Size: %d\n", problem.M, problem.N, problem.K, problem.Grid_size);
// }
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk Line:1157 Problem M: %d, N: %d, K: %d, Grid Size: %d\n", problem.M, problem.N, problem.K, problem.Grid_size);
}
#endif
for(auto block_idx = get_block_1d_id();
@@ -1174,13 +1176,14 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
block_2_ctile_map_streamk.get_block_itr(block_idx, iter_start, iter_end);
num_k_block_main_loop = iter_end - iter_start;
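// Note (hedged, general Stream-K background): get_block_itr() assigns this workgroup a
// contiguous range of K-iterations [iter_start, iter_end) that may span output-tile
// boundaries; "sk" blocks contribute partial results that are combined later, while
// "dp" blocks own complete tiles, so num_k_block_main_loop is this block's main-loop count.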
#if 1
// Emin @debug
// Debug: Print block information
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Block Index: %d, Iteration Start: %d, Iteration End: %d, Is Stream-K: %d, Is Data-Parallel: %d\n",
block_idx, iter_start, iter_end, is_sk_block, is_dp_block);
}
#endif
__syncthreads();
while(true)
@@ -1216,6 +1219,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
#if 0
// Emin @debug
// Debug: Print grid descriptor sizes
if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -1225,6 +1230,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
}
#endif
// Emin @added
__syncthreads();
@@ -1259,6 +1266,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
#if 0
// Emin @debug
// Debug: Print block data indices on grid
if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -1266,7 +1274,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
m_block_data_idx_on_grid, n_block_data_idx_on_grid, k0_block_data_idx_on_grid);
}
#endif
// Emin @added
__syncthreads();
@@ -1363,15 +1371,16 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
constexpr auto b_block_slice_copy_step =
make_multi_index(KPerBlock / BK1Number, 0, 0);
#if 0
// Emin @debug
// Debug: Print shared memory buffer sizes for A and B
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Shared Memory Buffer Size - A: %d, B: %d\n",
// a_block_desc_ak0_m_ak1.GetElementSpaceSize(),
// b_block_desc_bk0_n_bk1.GetElementSpaceSize());
// }
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Shared Memory Buffer Size - A: %d, B: %d\n",
a_block_desc_ak0_m_ak1.GetElementSpaceSize(),
b_block_desc_bk0_n_bk1.GetElementSpaceSize());
}
#endif
// Blockwise GEMM pipeline
static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
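// Note (hedged, general CK structure): BlockwiseGemmPipe implements the main K-loop,
// streaming A/B tiles from global memory through LDS and accumulating partial products
// in VGPRs for num_k_block_main_loop iterations before the CShuffle epilogue below.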
@@ -1528,6 +1537,9 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
n_thread_data_on_block_idx[I2]),
ck::tensor_operation::element_wise::PassThrough{}};
// Emin @Note: traced this call path: ThreadGroupTensorSliceTransfer_v6r1r2 --> Threadwise_tensor_slice_transfer_v6r1r2
// shuffle: blockwise copy C from LDS to global
auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1r2<
ThisThreadBlock, // ThreadGroup
@@ -1588,16 +1600,34 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
// Emin @Note: this always prints 0
#if 0
// Emin @debug
// Debug: Print before writing C to LDS
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk line 1594 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast<int>(access_id));
printf("Gridwise_gemm_sk line 1606 --Block %d, Access %d: Writing C from VGPR to LDS.\n", block_idx, static_cast<int>(access_id));
}
#endif
// Emin @added
__syncthreads();
// Emin @debug
// Debug: Print sfc_c_vgpr index tuple
#if 1
if (threadIdx.x == 0 && threadIdx.y == 0) {
auto index_tuple = sfc_c_vgpr.GetIndexTupleOfNumber(access_id);
printf("Gridwise_gemm_sk Debug line 1618--Block %d, Access %d: sfc_c_vgpr Index Tuple = (%d, %d, %d, ...).\n",
block_idx, static_cast<int>(access_id),
static_cast<int>(index_tuple.At(Number<0>{})),
static_cast<int>(index_tuple.At(Number<1>{})),
static_cast<int>(index_tuple.At(Number<2>{}))); // Adjust based on tuple size
}
#endif
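// Note (hedged): sfc_c_vgpr is a space-filling-curve descriptor over the accumulator
// tile held in VGPRs; access_id selects which slice of that curve is shuffled through
// LDS on this pass, which is the per-access index tuple printed above.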
// each thread write its data from VGPR to LDS
c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
@@ -1628,9 +1658,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
// Emin @added
__syncthreads();
#if 1
// Emin @debug
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk line 1602 --is_sk_block !! each block copy data from LDS to global.\n");
printf("Gridwise_gemm_sk line 1662 --is_sk_block !! each block copy data from LDS to global #Start.\n");
}
#endif
// Emin @added
__syncthreads();
@@ -1645,6 +1679,14 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
c_shuffle_block_buf,
c_grid_desc_mblock_mperblock_nblock_nperblock,
c_grid_buf);
#if 1
// Emin @debug
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Gridwise_gemm_sk line 1684 --is_sk_block !! each block copy data from LDS to global #End.\n");
}
#endif
}
if constexpr(access_id < num_access - 1)
@@ -103,13 +103,13 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
auto dst_vector_container = dst_vector_type{};
// Emin @debug
// Debug: Print source vector data if valid
// Debug: Print source vector data if valid
if (threadIdx.x == 0 && threadIdx.y == 0 && is_src_valid) {
// printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f\n", static_cast<int>(idx_1d.value), static_cast<float>());
printf("BlockId %d - Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %f \n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
printf("BlockId %d - Threadwise_tensor slice v6r1r2 line 109: Src Vector Data at idx %d: %f \n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<0>{})));
// printf("Threadwise_tensor slice v6r1r2 line 108: Src Vector Data at idx %d: %hu \n", static_cast<int>(idx_1d.value), src_vector_container.template AsType<SrcData>().At(Number<0>{}));
}
// Emin@debug
// Emin @debug
// apply pointwise operation
static_for<0, ScalarPerVector, 1>{}([&](auto i) {
@@ -120,14 +120,15 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
// Emin @debug
// Debug: Print element-wise operation result
// if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(v));
// }
if (threadIdx.x == 0 && threadIdx.y == 0) {
printf("Threadwise_tensor slice v6r1r2 line 121 : Element-wise Operation Result at idx %d: %f\n", static_cast<int>(i.value), static_cast<float>(v));
}
// Emin @added
__syncthreads();
// Emin @debug
#if 1
// Debug: Print SrcData before and after applying element-wise operation
if (threadIdx.x == 0 && threadIdx.y == 0) {
// printf("Threadwise_tensor_slice_v6r1r2 line 127 : SrcData before element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(src_vector_container.template AsType<SrcData>().At(Number<i>{})));
@@ -139,7 +140,7 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
// printf("SrcData after element-wise op at idx %d: %f \n", static_cast<int>(i.value), static_cast<float>(v));
printf("BlockId %d - Threadwise_tensor_slice_v6r1r2 line 129 : SrcData after element-wise op at idx %d , i %d: %f \n" , static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value) , static_cast<int>(i.value), static_cast<float>(v));
}
#endif
// Emin @added
__syncthreads();
@@ -171,14 +172,18 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
dst_coord_.GetOffset(),
is_dst_valid,
dst_vector_container.template AsType<dst_vector_t>()[I0]);
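// Note (hedged): the call above stores the assembled destination vector into dst_buf at
// the current coordinate offset; is_dst_valid masks lanes that fall outside the tensor.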
#if 0
// Emin @debug
// // Debug: Print data before copying from dst_vector into dst_buf
if (threadIdx.x == 0 && threadIdx.y == 0 && is_dst_valid) {
// printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<DstData>().At(I0));
// printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
// // // // Debug: Print data before copying from dst_vector into dst_buf
// if (threadIdx.x == 0 && threadIdx.y == 0 && is_dst_valid) {
// // printf("Dst Vector Data being copied to dst_buf at idx %d: %v4hu", static_cast<int>(idx_1d.value), dst_buf.template AsType<DstData>().At(I0));
// // printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
}
// printf("BlockId %d - Dst Vector Data being copied to dst_buf at idx %d: %hu\n", static_cast<int>(blockIdx.x) , static_cast<int>(idx_1d.value), dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid));
// }
#endif
// move coordinate
if constexpr(idx_1d.value != num_access - 1)