Commit e8d6434d authored by wangshaojie6

1. Remove comments. 2. Add CheckValidity. 3. Add grid size computation.

parent f26fb605
@@ -325,7 +325,6 @@ int main(int argc, char* argv[])
// alloc work space
size_t bwd_weight_workspace_size = conv->GetWorkSpaceSize(argument.get());
float conv_ave_time = 0.f;
float type_convert_ave_time = 0.f;
DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size);
wei_work_space_device_buf.SetZero();
@@ -341,42 +340,6 @@ int main(int argc, char* argv[])
conv_ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
#if 0
// do type convert
auto type_convert = DeviceUnaryElementwiseTypeConvertInstance{};
auto type_convert_invoker = type_convert.MakeInvokerPointer();
int tensor_size =
std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
auto type_convert_argument =
type_convert.MakeArgumentPointer(wei_work_space_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
{tensor_size},
{1},
{1},
UnaryTypeConvert{});
if(!type_convert.IsSupportedArgument(type_convert_argument.get()))
{
std::cout << "wrong! device_type_convert with the specified compilation parameters does "
"not support this convert problem"
<< std::endl;
return 1;
}
type_convert_ave_time =
type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
// type_convert_invoker->Run(type_convert_argument.get(), StreamConfig{nullptr, time_kernel});
#endif
// host code to check if conv give me a right result
// Tensor<AccDataType> wei_k_c_y_x_device_result_fp32(
// ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
// wei_work_space_device_buf.FromDevice(wei_k_c_y_x_device_result_fp32.mData.data());
// const auto type_cvt_functor = [&](AccDataType a) {
// return ck::type_convert<WeiDataType, AccDataType>(a);
// };
// host_elementwise<Tensor<WeiDataType>, Tensor<AccDataType>, decltype(type_cvt_functor)>(
// wei_k_c_y_x_device_result, wei_k_c_y_x_device_result_fp32, filter_dims,
// type_cvt_functor);
std::size_t flop = ck::utils::conv::get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
@@ -391,8 +354,8 @@ int main(int argc, char* argv[])
float gb_per_sec = num_btype / 1.E6 / conv_ave_time;
std::cout << "Perf: conv: " << conv_ave_time << " ms, type_convert: " << type_convert_ave_time
<< " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl;
std::cout << "Perf: conv: " << conv_ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s" << std::endl;
if(do_verification)
{
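The removed #if 0 block and the commented-out verification above both reduce to the same idea: the backward-weight kernel accumulates into an AccDataType workspace, and a per-element type convert then produces the WeiDataType result. A minimal host-side analogue of that per-element convert is sketched below; double and float stand in for AccDataType and WeiDataType and the values are made up, so this is an illustration rather than the library's code.

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    // Stand-ins for the AccDataType workspace and the WeiDataType output.
    const std::vector<double> wei_workspace{1.25, -3.5, 0.0625, 7.0};
    std::vector<float> wei_result(wei_workspace.size());

    // Per-element convert functor, analogous to the removed type_cvt_functor
    // built from ck::type_convert<WeiDataType, AccDataType>.
    const auto type_cvt_functor = [](double a) { return static_cast<float>(a); };
    std::transform(wei_workspace.begin(), wei_workspace.end(), wei_result.begin(), type_cvt_functor);

    for(float v : wei_result)
        std::printf("%g\n", v);
    return 0;
}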
@@ -1051,8 +1051,16 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
int tensor_size =
std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
GridDesc_M0 a_grid_desc_m0_ = MakeDescriptor_M0<1>({tensor_size}, {1}, 240, 256);
GridDesc_M0 b_grid_desc_m0_ = MakeDescriptor_M0<1>({tensor_size}, {1}, 240, 256);
const index_t type_convert_grid_size = GridwiseUEltwise::CalculateGridSize(tensor_size);
GridDesc_M0 a_grid_desc_m0_ =
MakeDescriptor_M0<1>({tensor_size}, {1}, type_convert_grid_size, 256);
GridDesc_M0 b_grid_desc_m0_ =
MakeDescriptor_M0<1>({tensor_size}, {1}, type_convert_grid_size, 256);
if(!GridwiseUEltwise::CheckValidity(a_grid_desc_m0_, b_grid_desc_m0_))
{
throw std::runtime_error("wrong! GridwiseUnaryElementwise_1D has invalid setting");
}
// run kernel for type conversion
void* p_c_grid_tmp_ = static_cast<void*>(arg.p_c_grid_);
@@ -1061,7 +1069,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
float elapsed_time =
launch_and_time_kernel(stream_config,
kernel,
dim3(240),
dim3(type_convert_grid_size),
dim3(256),
0,
static_cast<AccDataType*>(arg.p_c_workspace_grid_),
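The hunks above stop hard-coding a grid of 240 blocks for the type-convert kernel: the grid now comes from GridwiseUEltwise::CalculateGridSize over the flattened filter tensor, and the launch is guarded by CheckValidity. A standalone sketch of the same sizing arithmetic follows; the filter dimensions and the ScalarPerVector value are hypothetical, while the block size of 256 and the ceil division mirror the diff.

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    constexpr int kBlockSize       = 256; // matches dim3(256) in the launch above
    constexpr int kScalarPerVector = 8;   // hypothetical; a template parameter in the real kernel

    // Hypothetical K, C, Y, X; the real code accumulates filter_dims the same way.
    const std::vector<int> filter_dims{256, 192, 3, 3};
    const int tensor_size =
        std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});

    // Same formula as GridwiseUEltwise::CalculateGridSize:
    // integer_divide_ceil(tensor_size, 256 * ScalarPerVector).
    const int elems_per_block = kBlockSize * kScalarPerVector;
    const int grid_size       = (tensor_size + elems_per_block - 1) / elems_per_block;

    std::printf("tensor_size = %d -> grid_size = %d\n", tensor_size, grid_size);
    // Prints: tensor_size = 442368 -> grid_size = 216
    return 0;
}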
@@ -60,7 +60,7 @@ struct DeviceUnaryElementwise : public BaseOperator
}
using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
using GridwiseBinEltwise = GridwiseUnaryElementwise_1D<ADataType,
using GridwiseUEltwise = GridwiseUnaryElementwise_1D<ADataType,
BDataType,
GridDesc_M0,
ElementwiseFunctor,
@@ -78,9 +78,11 @@ struct DeviceUnaryElementwise : public BaseOperator
p_b_(p_b),
shape_(shape),
functor_(functor),
blockSize_(256),
gridSize_(240) // FIXME - Calculate the grid size by number of CU in the future
blockSize_(256) // FIXME - Calculate the grid size by number of CU in the future
{
index_t tensor_size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
gridSize_ = GridwiseUEltwise::CalculateGridSize(tensor_size);
a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
}
@@ -99,7 +101,7 @@ struct DeviceUnaryElementwise : public BaseOperator
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto kernel = kernel_unary_elementwise_1d<GridwiseBinEltwise,
const auto kernel = kernel_unary_elementwise_1d<GridwiseUEltwise,
ADataType,
BDataType,
GridDesc_M0,
@@ -40,6 +40,19 @@ struct GridwiseUnaryElementwise_1D
return make_multi_index(global_thread_id * ScalarPerVector);
}
__host__ __device__ static constexpr bool CheckValidity(const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0)
{
return a_grid_desc_m0.GetLength(I0) == b_grid_desc_m0.GetLength(I0);
}
__host__ __device__ static constexpr index_t CalculateGridSize(const index_t tensor_size)
{
const index_t grid_size = math::integer_divide_ceil(tensor_size, 256 * ScalarPerVector);
return grid_size;
}
__device__ static void Run(const ADataType* __restrict__ p_a_global,
BDataType* __restrict__ p_b_global,
const GridDesc_M0 a_grid_desc_m0,
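The divisor in CalculateGridSize follows directly from the indexing above: each of the 256 threads in a block starts at global_thread_id * ScalarPerVector and processes ScalarPerVector contiguous elements, so one block covers 256 * ScalarPerVector elements and the grid must be the ceiling of tensor_size over that. A small self-contained check, using a hypothetical ScalarPerVector of 8 and the same flattened filter size as in the sketch further up:

#include <cstdio>

int main()
{
    constexpr int kBlockSize       = 256;
    constexpr int kScalarPerVector = 8;      // hypothetical template value
    constexpr int kTensorSize      = 442368; // hypothetical flattened filter size

    // Ceil division, as in math::integer_divide_ceil(tensor_size, 256 * ScalarPerVector).
    constexpr int kElemsPerBlock = kBlockSize * kScalarPerVector;
    constexpr int kGridSize      = (kTensorSize + kElemsPerBlock - 1) / kElemsPerBlock;

    // The grid must cover every element index at least once.
    static_assert(kGridSize * kElemsPerBlock >= kTensorSize, "grid does not cover the tensor");

    // Offset of the last global thread's first element (tid * ScalarPerVector, as in Run).
    constexpr int kLastThreadOffset = (kGridSize * kBlockSize - 1) * kScalarPerVector;
    std::printf("grid = %d, last thread starts at %d of %d elements\n",
                kGridSize, kLastThreadOffset, kTensorSize);
    return 0;
}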