fix build (#434)

* fix * fix * add instance

fix build (#434)
* fix * fix * add instance
e9d4e893 · Chao Liu · GitHub · aa0b0515 · e9d4e893 · e9d4e893
Unverified Commit e9d4e893 authored Sep 22, 2022 by Chao Liu Committed by GitHub Sep 22, 2022
3 changed files
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -332,7 +332,10 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
+              cde_element_op_{cde_element_op},
+              MRaw_{MRaw},
+              NRaw_{NRaw},
+              KRaw_{KRaw}
        {
            // populate pointer, desc for Ds
            static_for<0, NumDTensor, 1>{}([&](auto i) {
@@ -400,6 +403,11 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
        AElementwiseOperation a_element_op_;
        BElementwiseOperation b_element_op_;
        CDEElementwiseOperation cde_element_op_;
+        // for checking vector load/store
+        index_t MRaw_;
+        index_t NRaw_;
+        index_t KRaw_;
    };
    // Invoker
@@ -486,6 +494,86 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
            return false;
        }
+        // check vector load/store
+        {
+            using Row = ck::tensor_layout::gemm::RowMajor;
+            using Col = ck::tensor_layout::gemm::ColumnMajor;
+            // check vector load of A
+            if constexpr(is_same_v<ALayout, Row> && ABlockTransferSrcVectorDim == 2)
+            {
+                if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else if constexpr(is_same_v<ALayout, Col> && ABlockTransferSrcVectorDim == 1)
+            {
+                // FIXME: not rigorous
+                if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+            // check vector load of B
+            if constexpr(is_same_v<BLayout, Col> && BBlockTransferSrcVectorDim == 2)
+            {
+                if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else if constexpr(is_same_v<BLayout, Row> && BBlockTransferSrcVectorDim == 1)
+            {
+                // FIXME: not rigorous
+                if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+            // check vector load of Ds
+            // only RowMajor is supported for now
+            bool all_valid = true;
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                if constexpr(!is_same_v<DLayout, Row>)
+                {
+                    all_valid = false;
+                }
+            });
+            if(!all_valid)
+            {
+                return false;
+            }
+            // check vector store of E
+            // only RowMajor is supported for now
+            if constexpr(is_same_v<ELayout, Row>)
+            {
+                if(arg.NRaw_ % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+        }
        return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                           arg.b_grid_desc_n_k_,
                                           arg.ds_grid_desc_m_n_,

--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -3,27 +3,27 @@
 #include <cstring>
-// int profile_gemm(int, char*[]);
+int profile_gemm(int, char*[]);
-// int profile_gemm_splitk(int, char*[]);
+int profile_gemm_splitk(int, char*[]);
-// int profile_gemm_bilinear(int, char*[]);
+int profile_gemm_bilinear(int, char*[]);
-// int profile_gemm_add_add_fastgelu(int, char*[]);
+int profile_gemm_add_add_fastgelu(int, char*[]);
-// int profile_gemm_reduce(int, char*[]);
+int profile_gemm_reduce(int, char*[]);
-// int profile_gemm_bias_add_reduce(int, char*[]);
+int profile_gemm_bias_add_reduce(int, char*[]);
-// int profile_batched_gemm(int, char*[]);
+int profile_batched_gemm(int, char*[]);
-// int profile_batched_gemm_gemm(int, char*[]);
+int profile_batched_gemm_gemm(int, char*[]);
-// int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
+int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
-// int profile_batched_gemm_reduce(int, char*[]);
+int profile_batched_gemm_reduce(int, char*[]);
-// int profile_grouped_gemm(int, char*[]);
+int profile_grouped_gemm(int, char*[]);
-// int profile_conv_fwd(int, char*[]);
+int profile_conv_fwd(int, char*[]);
-// int profile_conv_fwd_bias_relu(int, char*[]);
+int profile_conv_fwd_bias_relu(int, char*[]);
-// int profile_conv_fwd_bias_relu_add(int, char*[]);
+int profile_conv_fwd_bias_relu_add(int, char*[]);
-// int profile_conv_bwd_data(int, char*[]);
+int profile_conv_bwd_data(int, char*[]);
-// int profile_conv_bwd_weight(int, char*[]);
+int profile_conv_bwd_weight(int, char*[]);
-// int profile_grouped_conv_fwd(int, char*[]);
+int profile_grouped_conv_fwd(int, char*[]);
-// int profile_normalization(int, char*[]);
+int profile_normalization(int, char*[]);
 int profile_layernorm(int, char*[]);
 int profile_groupnorm(int, char*[]);
-// int profile_reduce(int, char*[]);
+int profile_reduce(int, char*[]);
 static void print_helper_message()
 {
@@ -57,7 +57,6 @@ int main(int argc, char* argv[])
        return 0;
    }
-#if 0
    else if(strcmp(argv[1], "gemm") == 0)
    {
        return profile_gemm(argc, argv);
@@ -134,7 +133,6 @@ int main(int argc, char* argv[])
    {
        return profile_normalization(argc, argv);
    }
-#endif
    else if(strcmp(argv[1], "layernorm") == 0)
    {
        return profile_layernorm(argc, argv);