Commit c6b52884 authored by wangshaojie6

add unary type convert to bwd-weight example

parent c4b6b9b1
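For orientation, here is a condensed sketch of the flow this commit introduces in the backward-weight example: the convolution kernel accumulates its fp32 result into a workspace buffer, and a 1-D DeviceUnaryElementwise kernel with a UnaryTypeConvert<ck::bhalf_t, float> functor then converts that workspace into the bf16 weight buffer. The snippet is condensed from the diff below and is not standalone; the setup of `filter_dims`, `wei_work_space_device_buf`, `wei_device_buf`, and the conv `invoker`/`argument` is assumed from the example.

```cpp
#include "element_wise_operation.hpp"
#include "device_unary_elementwise.hpp"

// Functor and device instance as declared in the example below.
using UnaryTypeConvert =
    ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
using DeviceUnaryElementwiseTypeConvertInstance = ck::tensor_operation::device::
    DeviceUnaryElementwise<AccDataType, WeiDataType, UnaryTypeConvert, 1, 4>;

// 1. Run the backward-weight conv; it writes fp32 accumulation into the workspace buffer.
float conv_ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});

// 2. Convert the fp32 workspace to bf16 weights with a 1-D elementwise kernel.
auto type_convert         = DeviceUnaryElementwiseTypeConvertInstance{};
auto type_convert_invoker = type_convert.MakeInvokerPointer();
const int tensor_size =
    std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
auto type_convert_argument =
    type_convert.MakeArgumentPointer(wei_work_space_device_buf.GetDeviceBuffer(), // fp32 in
                                     wei_device_buf.GetDeviceBuffer(),            // bf16 out
                                     {tensor_size}, // flattened 1-D shape
                                     {1},           // input stride
                                     {1},           // output stride
                                     UnaryTypeConvert{});
float type_convert_ave_time =
    type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
```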
......@@ -15,6 +15,7 @@
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "element_wise_operation.hpp"
#include "device_unary_elementwise.hpp"
#include "device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
#include "reference_conv_backward_weight.hpp"
......@@ -30,6 +31,11 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnaryTypeConvert = ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
using DeviceUnaryElementwiseTypeConvertInstance = ck::tensor_operation::device::
DeviceUnaryElementwise<AccDataType, WeiDataType, UnaryTypeConvert, 1, 4>;
static constexpr auto ConvBwdWeightDefault =
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
......@@ -95,7 +101,7 @@ void host_elementwise(HostTensorB& B,
Functor functor)
{
size_t tensor_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
std::cout << __LINE__ << ":" << tensor_size << ", "<< A.mData[0] << std::endl;
std::cout << __LINE__ << ":" << tensor_size << ", " << A.mData[0] << std::endl;
for(std::size_t n = 0; n < tensor_size; ++n)
{
B.mData[n] = functor(A.mData[n]);
......@@ -318,7 +324,8 @@ int main(int argc, char* argv[])
// alloc work space
size_t bwd_weight_workspace_size = conv->GetWorkSpaceSize(argument.get());
float ave_time = 0.f;
float conv_ave_time = 0.f;
float type_convert_ave_time = 0.f;
DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
wei_work_space_device_buf.SetZero();
......@@ -349,17 +356,42 @@ int main(int argc, char* argv[])
return 1;
}
ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
conv_ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
// do type convert
auto type_convert = DeviceUnaryElementwiseTypeConvertInstance{};
auto type_convert_invoker = type_convert.MakeInvokerPointer();
int tensor_size =
std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
auto type_convert_argument =
type_convert.MakeArgumentPointer(wei_work_space_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
{tensor_size},
{1},
{1},
UnaryTypeConvert{});
if(!type_convert.IsSupportedArgument(type_convert_argument.get()))
{
std::cout << "wrong! device_type_convert with the specified compilation parameters does "
"not support this convert problem"
<< std::endl;
return 1;
}
type_convert_ave_time =
type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
// type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
// host code to check if conv give me a right result
Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
const auto type_cvt_functor = [&](AccDataType a) {
return ck::type_convert<WeiDataType, AccDataType>(a);
};
host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims, type_cvt_functor);
// Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
// ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
// wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
// const auto type_cvt_functor = [&](AccDataType a) {
// return ck::type_convert<WeiDataType, AccDataType>(a);
// };
// host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
// wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims,
// type_cvt_functor);
std::size_t flop = ck::utils::conv::get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
......@@ -371,12 +403,12 @@ int main(int argc, char* argv[])
params.filter_spatial_lengths_,
output_spatial_lengths);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float tflops = static_cast<float>(flop) / 1.E9 / conv_ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
float gb_per_sec = num_btype / 1.E6 / conv_ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
std::cout << "Perf: conv: " << conv_ave_time << " ms, type_convert: " << type_convert_ave_time
<< " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl;
if(do_verification)
{
......@@ -396,7 +428,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
//wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
if(do_log)
{
......
......@@ -12,7 +12,6 @@ namespace device {
template <typename ADataType,
typename BDataType,
typename ComputeDataType,
typename ElementwiseFunctor,
index_t Dim,
index_t ScalarPerVector>
......@@ -63,7 +62,6 @@ struct DeviceUnaryElementwise : public BaseOperator
using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
using GridwiseBinEltwise = GridwiseUnaryElementwise_1D<ADataType,
BDataType,
ComputeDataType,
GridDesc_M0,
ElementwiseFunctor,
ScalarPerVector>;
......@@ -81,7 +79,7 @@ struct DeviceUnaryElementwise : public BaseOperator
shape_(shape),
functor_(functor),
blockSize_(256),
gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
gridSize_(240) // FIXME - Calculate the grid size by number of CU in the future
{
a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
......
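The constructor above still hard-codes `gridSize_` (bumped from 120 to 240 here), and the FIXME notes it should eventually be derived from the number of compute units. One possible approach, not part of this commit and shown only as a sketch, would query the CU count through the HIP runtime:

```cpp
#include <hip/hip_runtime.h>

// Hypothetical helper (not in the commit): derive a grid size from the CU count,
// as the FIXME suggests, instead of hard-coding 240.
inline int GetGridSizeFromCUCount(int blocks_per_cu = 2)
{
    int device = 0;
    (void)hipGetDevice(&device);
    int cu_count = 0;
    (void)hipDeviceGetAttribute(&cu_count, hipDeviceAttributeMultiprocessorCount, device);
    return cu_count * blocks_per_cu; // e.g. 120 CUs * 2 = 240
}
```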
......@@ -335,11 +335,20 @@ struct UnaryTypeConvert;
template <>
struct UnaryTypeConvert<float, ck::bhalf_t>
{
__host__ __device__ UnaryTypeConvert(const int32_t divider = 1) { (void)divider; };
__host__ __device__ void operator()(float& y, ck::bhalf_t& x) const { y = ck::type_convert<float, ck::bhalf_t>(x); };
__host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
{
y = ck::type_convert<float, ck::bhalf_t>(x);
};
};
template <>
struct UnaryTypeConvert<ck::bhalf_t, float>
{
__host__ __device__ void operator()(ck::bhalf_t& y, float& x) const
{
y = ck::type_convert<ck::bhalf_t, float>(x);
};
};
} // namespace element_wise
} // namespace tensor_operation
......
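The new `UnaryTypeConvert<ck::bhalf_t, float>` specialization mirrors the existing float-from-bhalf_t one: `operator()(y, x)` writes the converted value into `y`. A minimal host-side usage sketch, assuming only the CK headers already included in the example (not part of the commit):

```cpp
// Convert a single fp32 value to bf16 with the new functor.
ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float> cvt{};
float x         = 1.5f;
ck::bhalf_t y{};
cvt(y, x); // y = ck::type_convert<ck::bhalf_t, float>(x)
```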
......@@ -18,16 +18,11 @@ __global__ void kernel_unary_elementwise_1d(const ADataType* __restrict__ p_a_gl
const GridDesc_M0 b_grid_desc_m0,
const ElementwiseFunctor functor)
{
GridwiseUEltwise::Run(p_a_global,
p_b_global,
a_grid_desc_m0,
b_grid_desc_m0,
functor);
GridwiseUEltwise::Run(p_a_global, p_b_global, a_grid_desc_m0, b_grid_desc_m0, functor);
}
template <typename ADataType,
typename BDataType,
typename ComputeDataType,
typename GridDesc_M0,
typename ElementwiseFunctor,
index_t ScalarPerVector>
......@@ -46,11 +41,9 @@ struct GridwiseUnaryElementwise_1D
}
__device__ static void Run(const ADataType* __restrict__ p_a_global,
const BDataType* __restrict__ p_b_global,
CDataType* __restrict__ p_c_global,
BDataType* __restrict__ p_b_global,
const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0,
const GridDesc_M0 c_grid_desc_m0,
const ElementwiseFunctor functor)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
......@@ -58,14 +51,14 @@ struct GridwiseUnaryElementwise_1D
auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_global, b_grid_desc_m0.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, ScalarPerVector, true> b_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, ScalarPerVector, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, ScalarPerVector, true> b_thread_buf;
const auto thread_store_global_offset = CalculateElementwiseIndex();
auto a_global_load =
ThreadwiseTensorSliceTransfer_v2<ADataType,
ComputeDataType,
ADataType,
GridDesc_M0,
decltype(thread_desc_m0),
Sequence<ScalarPerVector>, // SliceLengths
......@@ -76,7 +69,7 @@ struct GridwiseUnaryElementwise_1D
false>{a_grid_desc_m0, thread_store_global_offset};
auto b_global_write =
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
ThreadwiseTensorSliceTransfer_v1r3<BDataType,
BDataType,
decltype(thread_desc_m0),
GridDesc_M0,
......@@ -92,7 +85,7 @@ struct GridwiseUnaryElementwise_1D
const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto m0 = c_grid_desc_m0.GetLength(I0);
const auto m0 = b_grid_desc_m0.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * ScalarPerVector;
const auto loop_step_index = make_multi_index(loop_step);
......@@ -105,8 +98,7 @@ struct GridwiseUnaryElementwise_1D
static_for<0, ScalarPerVector, 1>{}([&](auto m) {
constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m));
functor(b_thread_buf(Number<offset>{}),
a_thread_buf(Number<offset>{}));
functor(b_thread_buf(Number<offset>{}), a_thread_buf(Number<offset>{}));
});
b_global_write.Run(thread_desc_m0,
......
......@@ -111,6 +111,15 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
}
else
{
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
__func__,
grid_dim.x,
grid_dim.y,
grid_dim.z,
block_dim.x,
block_dim.y,
block_dim.z);
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
return 0;
......