removed unnecessary comments, renamed files

57b9cf69 · Astha Rai · a3115568 · 57b9cf69 · 57b9cf69 · 57b9cf69
Commit 57b9cf69 authored Oct 24, 2023 by Astha Rai
8 changed files
--- a/client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
+++ b/client_example/23_elementwise_transpose/elementwise_transpose_3d.cpp
@@ -45,12 +45,7 @@ int main()

    std::vector<std::size_t> ncdhw = {N, C, D, H, W};
    std::vector<std::size_t> nchwd = {N, C, H, W, D};
-    // Tensor<ADataType> a(ncdhw);
-    // Tensor<BDataType> b(nchwd);
-
-    auto size = N * C * D * H * W;
-
-    // a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+    auto size                      = N * C * D * H * W;

    std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
    std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, 1, D * H, D};

--- a/example/44_elementwise_permute/CMakeLists.txt
+++ b/example/44_elementwise_permute/CMakeLists.txt
 add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
 add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
-add_example_executable(example_elementwise_permute_5D elementwise_permute_5D.cpp)
-add_example_executable(example_elementwise_permute_5D_3d elementwise_permute_5D_3d.cpp)
+add_example_executable(example_elementwise_permute elementwise_permute.cpp)
+add_example_executable(example_elementwise_permute_3d elementwise_permute_3d.cpp)
--- a/example/44_elementwise_permute/elementwise_permute_5D.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_5D.cpp
@@ -19,13 +19,13 @@ using BDataType = F16;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
-                                                        ck::Tuple<BDataType>,
-                                                        PassThrough,
-                                                        5,
-                                                        8,
-                                                        ck::Sequence<8>,
-                                                        ck::Sequence<1>>;
+    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
+                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
+                                                        PassThrough,          // ElementwiseOp
+                                                        5,                    // NumDim
+                                                        8,                    // MPerThread
+                                                        ck::Sequence<8>,  // InScalarPerVectorSeq
+                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq

 template <typename HostTensorA, typename HostTensorB, typename Functor>
 void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
@@ -103,7 +103,6 @@ int main()

    float gb_per_sec = num_btype / 1.E6 / ave_time;

-    // LogRangeAsType<float>(std::cout << "A  : ", a.mData, ",") << std::endl;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

@@ -115,9 +114,6 @@ int main()
        Tensor<BDataType> host_b(nchwd);
        host_elementwise4D(host_b, a, PassThrough{});

-        // LogRangeAsType<float>(std::cout << "B  : ", b.mData, ",") << std::endl;
-        // LogRangeAsType<float>(std::cout << "Host B  : ", host_b.mData, ",") << std::endl;
-
        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

--- a/example/44_elementwise_permute/elementwise_permute_5D_3d.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_5D_3d.cpp
@@ -19,17 +19,17 @@ using BDataType = F16;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>,
-                                                          ck::Tuple<BDataType>,
-                                                          PassThrough,
-                                                          2, // NumDim_m, {N, C}
-                                                          2, // NumDim_n, {H, W}
-                                                          1, // NumDim_k, {D}
-                                                          8,
-                                                          8,
-                                                          8,
-                                                          ck::Sequence<8>,
-                                                          ck::Sequence<8>>;
+    ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
+                                                          ck::Tuple<BDataType>, // OutDataTypeTuple
+                                                          PassThrough,          // ElementwiseOp
+                                                          2,                    // NumDim_m, {N, C}
+                                                          2,                    // NumDim_n, {H, W}
+                                                          1,                    // NumDim_k, {D}
+                                                          8,                    // MPerThread
+                                                          8,                    // NPerThread
+                                                          8,                    // KPerThread
+                                                          ck::Sequence<1>,  // InScalarPerVectorSeq
+                                                          ck::Sequence<1>>; // OutScalarPerVectorSeq

 template <typename HostTensorA, typename HostTensorB, typename Functor>
 void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
@@ -109,12 +109,8 @@ int main()
    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
-
-        // LogRangeAsType<float>(std::cout << "A  : ", a.mData, ",") << std::endl;
-        // LogRangeAsType<float>(std::cout << "B  : ", b.mData, ",") << std::endl;
        Tensor<BDataType> host_b(nchwd);
        host_elementwise4D(host_b, a, PassThrough{});
-        // LogRangeAsType<float>(std::cout << "Host B  : ", host_b.mData, ",") << std::endl;

        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);

--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp
@@ -19,13 +19,13 @@ using BDataType = F16;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
-                                                        ck::Tuple<BDataType>,
-                                                        PassThrough,
-                                                        4,
-                                                        8,
-                                                        ck::Sequence<8>,
-                                                        ck::Sequence<1>>;
+    ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
+                                                        ck::Tuple<BDataType>, // OutDataTypeTuple
+                                                        PassThrough,          // Elementwise op
+                                                        4,                    // NumDim
+                                                        8,                    // MPerThread
+                                                        ck::Sequence<8>,  // InScalarPerVectorSeq
+                                                        ck::Sequence<1>>; // OutScalarPerVectorSeq

 template <typename HostTensorA, typename HostTensorB, typename Functor>
 void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
@@ -106,8 +106,6 @@ int main()
        b_device_buf.FromDevice(b.mData.data());
        Tensor<BDataType> host_b(nhwc);
        host_elementwise4D(host_b, a, PassThrough{});
-        // LogRangeAsType<float>(std::cout << "A  : ", a.mData, ",") << std::endl;
-        // LogRangeAsType<float>(std::cout << "B  : ", host_b.mData, ",") << std::endl;

        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);

--- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp
+++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp
@@ -17,15 +17,15 @@ using BDataType = F16;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using DeviceElementwisePermuteInstance =
-    ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
-                                                          ck::Tuple<BDataType>,
-                                                          PassThrough,
-                                                          3, // NumDim_M
-                                                          1, // NumDim_N
-                                                          1,
-                                                          1,
-                                                          ck::Sequence<1>,
-                                                          ck::Sequence<1>>;
+    ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
+                                                          ck::Tuple<BDataType>, // OutDataTypeTuple
+                                                          PassThrough,          // Elementwise op
+                                                          3,                    // NumDim_M
+                                                          1,                    // NumDim_N
+                                                          1,                    // MPerThread
+                                                          1,                    // NPerThread
+                                                          ck::Sequence<1>,  // InScalarPerVectorSeq
+                                                          ck::Sequence<1>>; // OutScalarPerVectorSeq

 template <typename HostTensorA, typename HostTensorB, typename Functor>
 void host_elementwise4D(HostTensorB& B_nhwc,
@@ -48,19 +48,10 @@ int main()
    bool do_verification = true;
    bool time_kernel     = true;

-    // const int N = 120;
-    // const int C = 128;
-    // const int H = 32;
-    // const int W = 1024;
-    const int N = 16;
-    const int C = 8;
+    const int N = 120;
+    const int C = 128;
    const int H = 32;
-    const int W = 64;
-    /**const int N = 120;
-    const int H = 32;
-    const int W = 64;
-
-    const int C = 128;**/
+    const int W = 1024;

    std::vector<std::size_t> nchw = {N, C, H, W};
    std::vector<std::size_t> nhwc = {N, H, W, C};
@@ -74,7 +65,6 @@ int main()
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a.mData.data());
-    // LogRangeAsType<float>(std::cout << "Tensor a  : ", a.mData, ",") << std::endl;

    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};
@@ -110,25 +100,18 @@ int main()

    float gb_per_sec = num_btype / 1.E6 / ave_time;

-    // LogRangeAsType<float>(std::cout << "A  : ", a.mData, ",") << std::endl;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    bool pass = true;

-    // LogRangeAsType<float>(std::cout << "B  : ", b.mData, ",") << std::endl;
    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
-        // LogRangeAsType<float>(std::cout << "Tensor b  : ", b.mData, ",") << std::endl;

        Tensor<BDataType> host_b(nhwc);
        host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
            host_b, a, nchw, PassThrough{});
-        // LogRangeAsType<float>(std::cout << "Host_b  : ", host_b.mData, ",") << std::endl;
-
-        // LogRangeAsType<float>(std::cout << "B  : ", b.mData, ",") << std::endl;
-        // LogRangeAsType<float>(std::cout << "Host b  : ", host_b.mData, ",") << std::endl;
        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

--- a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_3d.hpp
@@ -143,8 +143,9 @@ struct GridwiseElementwise_3D
                    Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
                    Sequence<0, 1, 2>,                            // DimAccessOrder
                    0,                                            // SrcVectorDim
-                    1, // InScalarPerVectorSeq::At(I),                  // ScalarPerVector
-                    1, // SrcScalarStrideInVector
+                    InScalarPerVectorSeq::At(I), // ScalarPerVector, taken from
+                                                 // InScalarPerVectorSeq for input I
+                    1,                           // SrcScalarStrideInVector
                    true>{in_grid_3d_desc_tuple[I], thread_global_offset};
            },
            Number<NumInput>{});
@@ -163,7 +164,7 @@ struct GridwiseElementwise_3D
                    Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
                    Sequence<0, 1, 2>,                            // DimAccessOrder
                    1,                                            // SrcVectorDim
-                    1,                                            // OutScalarPerVectorSeq::At(I),
+                    OutScalarPerVectorSeq::At(I),                 // OutScalarPerVectorSeq::At(I),
                    InMemoryDataOperationEnum::Set,
                    1,
                    true>(out_grid_3d_desc_tuple[I], thread_global_offset, PassThroughOp{});

--- a/library/include/ck/library/tensor_operation_instance/gpu/transpose_3d.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/transpose_3d.hpp
@@ -42,16 +42,18 @@ struct DeviceOperationInstanceFactory<
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-        if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+        if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
+                     is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
        {
            add_device_transpose_f32_instances(op_ptrs);
        }
-        else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+        else if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
+                          is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
        {
            add_device_transpose_f16_instances(op_ptrs);
        }
+        return op_ptrs;
    }
-    return op_ptrs;
 };

 } // namespace instance