init code for tile skipping

506c8eb3 · wangshaojie6 · 3f9100cc · 506c8eb3 · 506c8eb3 · 506c8eb3
Commit 506c8eb3 authored Sep 13, 2022 by wangshaojie6
3 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
@@ -164,8 +164,8 @@ int main(int argc, char* argv[])
    // Output shape C[G0, M, G1, O]. Batch dim, outer dim, inner dim must match GEMM shape
    // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o])
    // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3])
-    ck::index_t G0 = 7;
+    ck::index_t G0 = 2;
-    ck::index_t G1 = 13;
+    ck::index_t G1 = 3;
    if(argc == 1)
    {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
@@ -591,6 +591,17 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
            A1ThreadSlice_K0_M_K1,
            make_tuple(A1ThreadSliceM * A1ThreadSliceK1, A1ThreadSliceK1, I1));
+#if 0
+        if(threadIdx.x == 0)
+        {
+            printf("bid=%d, A1ThreadSliceK0=%d, A1ThreadSliceM=%d, A1ThreadSliceK1=%d\n",
+                static_cast<int>(blockIdx.x), 
+                static_cast<int>(A1ThreadSliceK0), 
+                static_cast<int>(A1ThreadSliceM),
+                static_cast<int>(A1ThreadSliceK1));
+        }
+#endif
        // B1 matrix in LDS memory, dst of blockwise copy
        constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
@@ -753,6 +764,10 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
        index_t gemm1_k_block_outer_index = 0;
        do
        {
+            if((m_block_data_idx_on_grid < gemm1_k_block_outer_index * NPerBlock) && ((m_block_data_idx_on_grid + MPerBlock - 1) < (gemm1_k_block_outer_index * NPerBlock + NPerBlock - 1)))
+            {
+                continue;
+            }
            // gemm0
            gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
                                                                   a_block_desc_ak0_m_ak1,

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp
@@ -195,7 +195,7 @@ struct ReferenceBatchedGemmUpperTriangleMinusInf : public device::BaseOperator
                AccDataType v_c;
-                if(n <= m)
+                if(((n >> 7) << 7) <= ((m >> 7) << 7))
                {
                    arg.c_element_op_(v_c, v_acc);
                }