adding conv multiple D

0b997ce4 · Chao Liu · 69d323de · 0b997ce4 · 0b997ce4 · 0b997ce4
Commit 0b997ce4 authored Jul 18, 2022 by Chao Liu
4 changed files
--- a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
@@ -16,7 +16,7 @@ using S = ck::Sequence<Is...>;

 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::UnaryConvert;

 static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
@@ -48,18 +48,18 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
    2,              // ABlockTransferSrcVectorDim
    8,              // ABlockTransferSrcScalarPerVector
    8,              // ABlockTransferDstScalarPerVector_K1
-    true,           // ABlockLdsAddExtraM
+    true,           // ABlockLdsExtraM
    S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
    S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
    2,              // BBlockTransferSrcVectorDim
    8,              // BBlockTransferSrcScalarPerVector
    8,              // BBlockTransferDstScalarPerVector_K1
-    true,           // BBlockLdsAddExtraN
+    true,           // BBlockLdsExtraN
    7,              // CThreadTransferSrcDstVectorDim
    1>;             // CThreadTransferDstScalarPerVector
 #else
-using CShuffleDataType = float;
+using CShuffleDataType = ck::half_t;

 template <ck::index_t NDimSpatial>
 using DeviceConvNDFwdInstance =
@@ -69,16 +69,17 @@ using DeviceConvNDFwdInstance =
        WeiDataType,      //
        AccDataType,      //
        CShuffleDataType, //
-        ck::Tuple<>,
+        ck::Tuple<>,      //
        OutDataType,      //
        InElementOp,      // Input Elementwise Operation
        WeiElementOp,     // Weights Elementwise Operation
        OutElementOp,     // Output Elementwise Operation
        ConvFwdDefault,   // ConvForwardSpecialization
+        1,                //
        256,              // BlockSize
        128,              // MPerBlock
        256,              // NPerBlock
-        4,              // K0PerBlock
+        32,               // KPerBlock
        8,                // K1
        32,               // MPerXdl
        32,               // NPerXdl
@@ -90,16 +91,18 @@ using DeviceConvNDFwdInstance =
        2,                // ABlockTransferSrcVectorDim
        8,                // ABlockTransferSrcScalarPerVector
        8,                // ABlockTransferDstScalarPerVector_K1
-        true,           // ABlockLdsAddExtraM
+        1,                // ABlockLdsExtraM
        S<4, 64, 1>,      // BBlockTransferThreadClusterLengths_K0_N_K1
        S<1, 0, 2>,       // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,       // BBlockTransferSrcAccessOrder
        2,                // BBlockTransferSrcVectorDim
        8,                // BBlockTransferSrcScalarPerVector
        8,                // BBlockTransferDstScalarPerVector_K1
-        true,           // BBlockLdsAddExtraN
-        7,              // CThreadTransferSrcDstVectorDim
-        1>;             // CThreadTransferDstScalarPerVector
+        1,                // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 32, 1, 8>,
+        8>;
 #endif

 int main(int argc, char* argv[])

--- a/include/ck/tensor_operation/gpu/device/device_convnd_fwd_multiple_d_nwc_kxc_nwk_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_fwd_multiple_d_nwc_kxc_nwk_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -618,18 +618,18 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                                              arg.block_2_etile_map_);
            };

-            float ave_time = 0;
+            float avg_time = 0;

            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
            {
-                ave_time = launch_kernel(integral_constant<bool, true>{});
+                avg_time = launch_kernel(integral_constant<bool, true>{});
            }
            else
            {
-                ave_time = launch_kernel(integral_constant<bool, false>{});
+                avg_time = launch_kernel(integral_constant<bool, false>{});
            }

-            return ave_time;
+            return avg_time;
        }

        // polymorphic

--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -12,16 +12,58 @@ namespace element_wise {

 struct PassThrough
 {
-    template <typename T>
-    __host__ __device__ void operator()(T& y, const T& x) const
+    // Copies x into y unchanged. The primary template is declared without a body here, so
+    // only the same-type specializations listed below provide a definition; any other
+    // (Y, X) pair has nothing to call.
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ void operator()<double, double>(double& y, const double& x) const
     {
-        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
-                          is_same<T, half_t>::value || is_same<T, bhalf_t>::value ||
-                          is_same<T, int32_t>::value || is_same<T, int8_t>::value,
-                      "Data type is not supported by this operation!");
+        y = x;
+    }

+    template <>
+    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
+    {
         y = x;
-    };
+    }
+
+    template <>
+    __host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
+    {
+        y = x;
+    }
+
+    // int32_t was accepted by the static_assert this change removes; keep it supported.
+    template <>
+    __host__ __device__ void operator()<int32_t, int32_t>(int32_t& y, const int32_t& x) const
+    {
+        y = x;
+    }
+
+    template <>
+    __host__ __device__ void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
+    {
+        y = x;
+    }
+};
+
+// Writes x converted to Y (through type_convert<Y>) into y.
+struct UnaryConvert
+{
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const
+    {
+        y = type_convert<Y>(x);
+    }
 };

 struct Scale