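Restrict 4gemm to PassThrough element-wise ops + fix DeviceGemmDl enable_if check (it tested AElementwiseOperation three times instead of A, B and C)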

7d2fa996 · root · bda26547 · 7d2fa996 · 7d2fa996
Commit 7d2fa996 authored May 27, 2022 by root
2 changed files
--- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
@@ -42,48 +42,54 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
-template <typename ALayout,
+template <
-          typename BLayout,
+    typename ALayout,
-          typename CLayout,
+    typename BLayout,
-          typename ADataType,
+    typename CLayout,
-          typename BDataType,
+    typename ADataType,
-          typename CDataType,
+    typename BDataType,
-          typename GemmAccDataType,
+    typename CDataType,
-          typename CShuffleDataType,
+    typename GemmAccDataType,
-          typename AElementwiseOperation,
+    typename CShuffleDataType,
-          typename BElementwiseOperation,
+    typename AElementwiseOperation,
-          typename CElementwiseOperation,
+    typename BElementwiseOperation,
-          GemmSpecialization GemmSpec,
+    typename CElementwiseOperation,
-          index_t NumGemmKPrefetchStage,
+    GemmSpecialization GemmSpec,
-          index_t BlockSize,
+    index_t NumGemmKPrefetchStage,
-          index_t MPerBlock,
+    index_t BlockSize,
-          index_t NPerBlock,
+    index_t MPerBlock,
-          index_t KPerBlock,
+    index_t NPerBlock,
-          index_t AK1,
+    index_t KPerBlock,
-          index_t BK1,
+    index_t AK1,
-          index_t MPerXDL,
+    index_t BK1,
-          index_t NPerXDL,
+    index_t MPerXDL,
-          index_t MXdlPerWave,
+    index_t NPerXDL,
-          index_t NXdlPerWave,
+    index_t MXdlPerWave,
-          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+    index_t NXdlPerWave,
-          typename ABlockTransferThreadClusterArrangeOrder,
+    typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
-          typename ABlockTransferSrcAccessOrder,
+    typename ABlockTransferThreadClusterArrangeOrder,
-          index_t ABlockTransferSrcVectorDim,
+    typename ABlockTransferSrcAccessOrder,
-          index_t ABlockTransferSrcScalarPerVector,
+    index_t ABlockTransferSrcVectorDim,
-          index_t ABlockTransferDstScalarPerVector_AK1,
+    index_t ABlockTransferSrcScalarPerVector,
-          bool ABlockLdsExtraM,
+    index_t ABlockTransferDstScalarPerVector_AK1,
-          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+    bool ABlockLdsExtraM,
-          typename BBlockTransferThreadClusterArrangeOrder,
+    typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
-          typename BBlockTransferSrcAccessOrder,
+    typename BBlockTransferThreadClusterArrangeOrder,
-          index_t BBlockTransferSrcVectorDim,
+    typename BBlockTransferSrcAccessOrder,
-          index_t BBlockTransferSrcScalarPerVector,
+    index_t BBlockTransferSrcVectorDim,
-          index_t BBlockTransferDstScalarPerVector_BK1,
+    index_t BBlockTransferSrcScalarPerVector,
-          bool BBlockLdsExtraN,
+    index_t BBlockTransferDstScalarPerVector_BK1,
-          index_t CShuffleMXdlPerWavePerShuffle,
+    bool BBlockLdsExtraN,
-          index_t CShuffleNXdlPerWavePerShuffle,
+    index_t CShuffleMXdlPerWavePerShuffle,
-          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+    index_t CShuffleNXdlPerWavePerShuffle,
-          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+    typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-          LoopScheduler LoopSched = make_default_loop_scheduler()>
+    index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+    LoopScheduler LoopSched = make_default_loop_scheduler(),
+    enable_if_t<
+        is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<CElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
+        bool> = false>
 struct DeviceCGemm_4Gemm_Xdl_CShuffle
    : public DeviceCGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
 {

--- a/include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp
@@ -60,8 +60,8 @@ template <
    index_t CThreadTransferDstScalarPerVector,
    enable_if_t<
        is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
-            is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+            is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
-            is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
+            is_same_v<CElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
        bool> = false>
 struct DeviceGemmDl
    : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>