Merge branch 'develop' into amd-develop

8f41bd8e · Jun Liu · 7f65ac05 · d7f05fb9 · 8f41bd8e · 8f41bd8e
Commit 8f41bd8e authored Apr 11, 2024 by Jun Liu
4 changed files
--- a/profiler/src/profile_contraction_scale.cpp
+++ b/profiler/src/profile_contraction_scale.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <iostream>
 #include <numeric>
@@ -19,7 +19,8 @@ static void print_helper_msg()
    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
              << "arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
              << "arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
-              << "arg4: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
+              << "arg4: Number of dimension for M, N and K (one for all)\n"
+              << "arg5: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
              << "                     1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
@@ -27,22 +28,22 @@ static void print_helper_msg()
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
              << "                     3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
-              << "arg5: verification (0: no; 1: yes)\n"
+              << "arg6: verification (0: no; 1: yes)\n"
-              << "arg6: initialization (0: no init; 1: integer value; 2: decimal "
+              << "arg7: initialization (0: no init; 1: integer value; 2: decimal "
              << "value)\n"
-              << "arg7: print tensor value (0: no; 1: yes)\n"
+              << "arg8: print tensor value (0: no; 1: yes)\n"
-              << "arg8: time kernel (0: no, 1: yes)\n"
+              << "arg9: time kernel (0: no, 1: yes)\n"
-              << "arg9: alpha\n"
+              << "arg10: alpha\n"
-              << "arg10 to 15: M0, M1, N0, N1, K0, K1\n"
+              << "arg11 to 16/28: M0, M1, N0, N1, K0, K1\n"
-              << "arg16 to 31: Strides for A, B, D and E (skip for default)\n"
+              << "arg17/29 to 32/63: Strides for A, B, E (skip for default)\n"
              << std::endl;
 }
 int profile_contraction_scale(int argc, char* argv[])
 {
-    const bool default_strides = argc == 16;
+    const bool default_strides = argc == 17 || argc == 29;
-    if(argc != 32 && argc != 16)
+    if(argc != 29 && argc != 65 && !default_strides)
    {
        print_helper_msg();
        exit(1);
@@ -50,31 +51,30 @@ int profile_contraction_scale(int argc, char* argv[])
    const auto data_type          = static_cast<ContractionDataType>(std::stoi(argv[2]));
    const auto compute_data_type  = static_cast<ContractionComputeDataType>(std::stoi(argv[3]));
-    const auto layout             = static_cast<ContractionMatrixLayout>(std::stoi(argv[4]));
+    const ck::index_t NumDimMNK   = std::stoi(argv[4]);
-    const bool do_verification    = std::stoi(argv[5]);
+    const auto layout             = static_cast<ContractionMatrixLayout>(std::stoi(argv[5]));
-    const ck::index_t init_method = std::stoi(argv[6]);
+    const bool do_verification    = std::stoi(argv[6]);
-    const bool do_log             = std::stoi(argv[7]);
+    const ck::index_t init_method = std::stoi(argv[7]);
-    const bool time_kernel        = std::stoi(argv[8]);
+    const bool do_log             = std::stoi(argv[8]);
-    const float alpha             = std::stof(argv[9]);
+    const bool time_kernel        = std::stoi(argv[9]);
+    const float alpha             = std::stof(argv[10]);
    std::vector<ck::index_t> M;
    std::vector<ck::index_t> N;
    std::vector<ck::index_t> K;
-    const ck::index_t dims_arg_num = 10;
+    const ck::index_t dims_arg_num = 11;
-    collect_index_params(argv, M, dims_arg_num, 2);
+    collect_index_params(argv, M, dims_arg_num, NumDimMNK);
-    collect_index_params(argv, N, dims_arg_num + 2, 2);
+    collect_index_params(argv, N, dims_arg_num + NumDimMNK, NumDimMNK);
-    collect_index_params(argv, K, dims_arg_num + 4, 2);
+    collect_index_params(argv, K, dims_arg_num + NumDimMNK * 2, NumDimMNK);
-    std::vector<ck::index_t> StridesA;
+    std::vector<ck::index_t> StridesA(NumDimMNK * 2);
-    std::vector<ck::index_t> StridesB;
+    std::vector<ck::index_t> StridesB(NumDimMNK * 2);
-    std::vector<ck::index_t> StridesE;
+    std::vector<ck::index_t> StridesE(NumDimMNK * 2);
-    std::vector<ck::index_t> StridesD;
    if(!default_strides)
    {
-        collect_index_params(argv, StridesA, dims_arg_num + 6, 4);
+        collect_index_params(argv, StridesA, dims_arg_num + NumDimMNK * 3, NumDimMNK * 2);
-        collect_index_params(argv, StridesB, dims_arg_num + 10, 4);
+        collect_index_params(argv, StridesB, dims_arg_num + NumDimMNK * 5, NumDimMNK * 2);
-        collect_index_params(argv, StridesE, dims_arg_num + 14, 4);
+        collect_index_params(argv, StridesE, dims_arg_num + NumDimMNK * 7, NumDimMNK * 2);
-        collect_index_params(argv, StridesD, dims_arg_num + 18, 4);
    }
    using F16  = ck::half_t;
@@ -93,32 +93,71 @@ int profile_contraction_scale(int argc, char* argv[])
            if(default_strides)
            {
-                assign_default_strides(a_layout, StridesA, {M[0], M[1], K[0], K[1]});
+                auto merge_dims = [](const std::vector<ck::index_t>& dims01,
-                assign_default_strides(b_layout, StridesB, {N[0], N[1], K[0], K[1]});
+                                     const std::vector<ck::index_t>& dims23) {
-                assign_default_strides(cde_layout, StridesE, {M[0], M[1], N[0], N[1]});
+                    std::vector<ck::index_t> dims_szt(dims01.begin(), dims01.end());
-                assign_default_strides(cde_layout, StridesD, {M[0], M[1], N[0], N[1]});
+                    dims_szt.insert(dims_szt.end(), dims23.begin(), dims23.end());
+                    return dims_szt;
+                };
+                assign_default_strides(a_layout, StridesA, merge_dims(M, K));
+                assign_default_strides(b_layout, StridesB, merge_dims(N, K));
+                assign_default_strides(cde_layout, StridesE, merge_dims(M, N));
            }
-            bool pass = ck::profiler::profile_contraction_impl<ALayout,
+            if(NumDimMNK == 2)
-                                                               BLayout,
+            {
-                                                               CDELayout,
+                bool pass = ck::profiler::profile_contraction_impl<2,
-                                                               DataType,
+                                                                   ALayout,
-                                                               ComputeDataType,
+                                                                   BLayout,
-                                                               ck::Tuple<>,
+                                                                   CDELayout,
-                                                               Scale>(do_verification,
+                                                                   DataType,
-                                                                      init_method,
+                                                                   ComputeDataType,
-                                                                      do_log,
+                                                                   ck::Tuple<>,
-                                                                      time_kernel,
+                                                                   Scale>(do_verification,
-                                                                      Scale{alpha},
+                                                                          init_method,
-                                                                      M,
+                                                                          do_log,
-                                                                      N,
+                                                                          time_kernel,
-                                                                      K,
+                                                                          Scale{alpha},
-                                                                      StridesA,
+                                                                          M,
-                                                                      StridesB,
+                                                                          N,
-                                                                      StridesE,
+                                                                          K,
-                                                                      StridesD);
+                                                                          StridesA,
+                                                                          StridesB,
-            return pass;
+                                                                          StridesE,
+                                                                          StridesE);
+                return pass;
+            }
+            else if(NumDimMNK == 6)
+            {
+                bool pass = ck::profiler::profile_contraction_impl<6,
+                                                                   ALayout,
+                                                                   BLayout,
+                                                                   CDELayout,
+                                                                   DataType,
+                                                                   ComputeDataType,
+                                                                   ck::Tuple<>,
+                                                                   Scale>(do_verification,
+                                                                          init_method,
+                                                                          do_log,
+                                                                          time_kernel,
+                                                                          Scale{alpha},
+                                                                          M,
+                                                                          N,
+                                                                          K,
+                                                                          StridesA,
+                                                                          StridesB,
+                                                                          StridesE,
+                                                                          StridesE);
+                return pass;
+            }
+            else
+            {
+                throw std::runtime_error("Not supported NumDimMNK");
+                return false;
+            }
        };
    auto run_profile_for_datatype = [&](auto type, auto compute_type) {

--- a/profiler/src/profile_grouped_conv_fwd.cpp
+++ b/profiler/src/profile_grouped_conv_fwd.cpp
@@ -26,6 +26,7 @@ enum struct ConvDataType
    F8_F8_F8,       // 4
    BF8_BF8_F8,     // 5
    F8_BF8_F8,      // 6
+    BF8_F8_F8,      // 7
 };
 #define OP_NAME "grouped_conv_fwd"
@@ -42,7 +43,8 @@ static void print_helper_msg()
        << "                 3: Input int8, Weight int8, Output int8\n"
        << "                 4: Input fp8, Weight fp8, Output fp8\n"
        << "                 5: Input bf8, Weight bf8, Output fp8\n"
-        << "                 6: Input fp8, Weight bf8, Output fp8)\n"
+        << "                 6: Input fp8, Weight bf8, Output fp8\n"
+        << "                 7: Input bf8, Weight fp8, Output fp8)\n"
        << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
        << "                     1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n"
        << "arg4: verification (0: no, 1: yes)\n"
@@ -281,6 +283,10 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
        {
            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, BF8{}, F8{}, F8{}, BF8{});
        }
+        else if(data_type == ConvDataType::BF8_F8_F8)
+        {
+            return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF8{}, F8{}, F8{}, BF8{}, F8{});
+        }
    }
    std::cout << "this data_type & layout is not implemented" << std::endl;

--- a/test/contraction/test_contraction_interface_xdl.cpp
+++ b/test/contraction/test_contraction_interface_xdl.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <stdexcept>
 #include <vector>
@@ -125,18 +125,6 @@ class ContractionDeviceOpWrapper
    }
 };
-TEST(TestContractionInterface, IncorrectNumDims)
-{
-    std::vector<std::vector<ck::index_t>> Dims    = {{4, 4}, {4, 4, 4, 4}, {4, 4, 4, 4, 4, 4}};
-    std::vector<std::vector<ck::index_t>> Strides = {{1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}};
-    ContractionDeviceOpWrapper<F32, F32, F32, F32, 1> wrapper_1d;
-    ContractionDeviceOpWrapper<F32, F32, F32, F32, 2> wrapper_2d;
-    ContractionDeviceOpWrapper<F32, F32, F32, F32, 3> wrapper_3d;
-    EXPECT_FALSE(wrapper_1d.IsSupportedInstance(Dims[0], Strides[0]));
-    EXPECT_TRUE(wrapper_2d.IsSupportedInstance(Dims[1], Strides[1]));
-    EXPECT_FALSE(wrapper_3d.IsSupportedInstance(Dims[2], Strides[2]));
-}
 TEST(TestContractionInterface, IncorrectDataTypes)
 {
    std::vector<ck::index_t> Dims    = {4, 4, 4, 4};

--- a/test/contraction/test_contraction_xdl.cpp
+++ b/test/contraction/test_contraction_xdl.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include <cstdlib>
 #include <iostream>
@@ -23,8 +23,11 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 using Bilinear = ck::tensor_operation::element_wise::Bilinear;
 using Scale    = ck::tensor_operation::element_wise::Scale;
+template <ck::index_t NDims>
 struct Dimensions
 {
+    constexpr static ck::index_t NumDimMNK = NDims;
    std::vector<ck::index_t> M;
    std::vector<ck::index_t> N;
    std::vector<ck::index_t> K;
@@ -42,53 +45,58 @@ class TestContraction : public ::testing::Test
    using ComputeDataType = std::tuple_element_t<5, Tuple>;
    using CDElementOp     = std::tuple_element_t<6, Tuple>;
-    std::vector<Dimensions> dimension_list = {{{32, 32}, {32, 32}, {32, 32}},
-                                              {{16, 16}, {32, 32}, {16, 16}}};
    std::vector<ck::index_t> init_methods = {1, 2};
    std::unique_ptr<CDElementOp> p_cd_element_op;
-    void Run()
+    template <ck::index_t NumDim>
+    void Run(Dimensions<NumDim> dimension_params)
    {
-        for(auto& dimension_params : dimension_list)
+        constexpr ck::index_t NumDimMNK = ck::remove_cvref_t<decltype(dimension_params)>::NumDimMNK;
+        std::vector<ck::index_t> StridesA(2 * NumDim);
+        std::vector<ck::index_t> StridesB(2 * NumDim);
+        std::vector<ck::index_t> StridesC(2 * NumDim);
+        std::vector<ck::index_t> StridesD(2 * NumDim);
+        const auto& M = dimension_params.M;
+        const auto& N = dimension_params.N;
+        const auto& K = dimension_params.K;
+        auto merge_dims = [](const std::vector<ck::index_t>& dims01,
+                             const std::vector<ck::index_t>& dims23) {
+            std::vector<ck::index_t> dims_szt(dims01.begin(), dims01.end());
+            dims_szt.insert(dims_szt.end(), dims23.begin(), dims23.end());
+            return dims_szt;
+        };
+        assign_default_strides(ALayout{}, StridesA, merge_dims(M, K));
+        assign_default_strides(BLayout{}, StridesB, merge_dims(N, K));
+        assign_default_strides(CDLayout{}, StridesC, merge_dims(M, N));
+        assign_default_strides(CDLayout{}, StridesD, merge_dims(M, N));
+        for(const ck::index_t init_method : init_methods)
        {
-            std::vector<ck::index_t> StridesA;
+            bool pass =
-            std::vector<ck::index_t> StridesB;
+                ck::profiler::profile_contraction_impl<NumDimMNK,
-            std::vector<ck::index_t> StridesC;
+                                                       ALayout,
-            std::vector<ck::index_t> StridesD;
+                                                       BLayout,
+                                                       CDLayout,
-            const auto& M = dimension_params.M;
+                                                       DataType,
-            const auto& N = dimension_params.N;
+                                                       ComputeDataType,
-            const auto& K = dimension_params.K;
+                                                       DTupleDataType,
+                                                       CDElementOp>(true /*do_verification*/,
-            assign_default_strides(ALayout{}, StridesA, {M[0], M[1], K[0], K[1]});
+                                                                    init_method,
-            assign_default_strides(BLayout{}, StridesB, {N[0], N[1], K[0], K[1]});
+                                                                    false /*do_logs*/,
-            assign_default_strides(CDLayout{}, StridesC, {M[0], M[1], N[0], N[1]});
+                                                                    false /*time_kernel*/,
-            assign_default_strides(CDLayout{}, StridesD, {M[0], M[1], N[0], N[1]});
+                                                                    *p_cd_element_op,
+                                                                    dimension_params.M,
-            for(const ck::index_t init_method : init_methods)
+                                                                    dimension_params.N,
-            {
+                                                                    dimension_params.K,
-                bool pass =
+                                                                    StridesA,
-                    ck::profiler::profile_contraction_impl<ALayout,
+                                                                    StridesB,
-                                                           BLayout,
+                                                                    StridesC,
-                                                           CDLayout,
+                                                                    StridesD);
-                                                           DataType,
+            EXPECT_TRUE(pass);
-                                                           ComputeDataType,
-                                                           DTupleDataType,
-                                                           CDElementOp>(true /*do_verification*/,
-                                                                        init_method,
-                                                                        false /*do_logs*/,
-                                                                        false /*time_kernel*/,
-                                                                        *p_cd_element_op,
-                                                                        dimension_params.M,
-                                                                        dimension_params.N,
-                                                                        dimension_params.K,
-                                                                        StridesA,
-                                                                        StridesB,
-                                                                        StridesC,
-                                                                        StridesD);
-                EXPECT_TRUE(pass);
-            }
        }
    }
 };
@@ -122,17 +130,31 @@ TYPED_TEST_SUITE(TestContractionScale, ScaleKernelTypes);
 TYPED_TEST(TestContractionBilinear, bilinear)
 {
    this->p_cd_element_op = std::make_unique<Bilinear>(1.f, 1.f);
-    this->Run();
+    this->template Run<6>({{2, 3, 2, 3, 2, 3}, {2, 3, 2, 3, 2, 3}, {2, 2, 2, 2, 2, 4}});
+    this->template Run<6>({{1, 1, 1, 3, 2, 3}, {1, 1, 1, 3, 2, 3}, {1, 1, 1, 2, 2, 4}});
+    this->template Run<2>({{16, 8}, {16, 8}, {16, 8}});
+    this->template Run<2>({{8, 16}, {16, 8}, {8, 16}});
    this->p_cd_element_op = std::make_unique<Bilinear>(-0.5f, 0.5f);
-    this->Run();
+    this->template Run<6>({{2, 3, 2, 3, 2, 3}, {2, 3, 2, 3, 2, 3}, {2, 2, 2, 2, 2, 4}});
+    this->template Run<6>({{1, 1, 1, 3, 2, 3}, {1, 1, 1, 3, 2, 3}, {1, 1, 1, 2, 2, 4}});
+    this->template Run<2>({{16, 8}, {16, 8}, {16, 8}});
+    this->template Run<2>({{8, 16}, {16, 8}, {8, 16}});
 }
 TYPED_TEST(TestContractionScale, scale)
 {
    this->p_cd_element_op = std::make_unique<Scale>(1.f);
-    this->Run();
+    this->template Run<6>({{2, 3, 2, 3, 2, 3}, {2, 3, 2, 3, 2, 3}, {2, 2, 2, 2, 2, 4}});
+    this->template Run<6>({{1, 1, 1, 3, 2, 3}, {1, 1, 1, 3, 2, 3}, {1, 1, 1, 2, 2, 4}});
+    this->template Run<2>({{16, 8}, {16, 8}, {16, 8}});
+    this->template Run<2>({{8, 16}, {16, 8}, {8, 16}});
    this->p_cd_element_op = std::make_unique<Scale>(0.5f);
-    this->Run();
+    this->template Run<6>({{2, 3, 2, 3, 2, 3}, {2, 3, 2, 3, 2, 3}, {2, 2, 2, 2, 2, 4}});
+    this->template Run<6>({{1, 1, 1, 3, 2, 3}, {1, 1, 1, 3, 2, 3}, {1, 1, 1, 2, 2, 4}});
+    this->template Run<2>({{16, 8}, {16, 8}, {16, 8}});
+    this->template Run<2>({{8, 16}, {16, 8}, {8, 16}});
 }
 template <typename Tuple>
@@ -165,15 +187,29 @@ TYPED_TEST_SUITE(TestContractionScaleMixedPrecision, ScaleKernelTypesMixedPrecis
 TYPED_TEST(TestContractionBilinearMixedPrecision, bilinear)
 {
    this->p_cd_element_op = std::make_unique<Bilinear>(1.f, 1.f);
-    this->Run();
+    this->template Run<6>({{2, 3, 2, 3, 2, 3}, {2, 3, 2, 3, 2, 3}, {2, 2, 2, 2, 2, 4}});
+    this->template Run<6>({{1, 1, 1, 3, 2, 3}, {1, 1, 1, 3, 2, 3}, {1, 1, 1, 2, 2, 4}});
+    this->template Run<2>({{16, 8}, {16, 8}, {16, 8}});
+    this->template Run<2>({{8, 16}, {16, 8}, {8, 16}});
    this->p_cd_element_op = std::make_unique<Bilinear>(-0.5f, 0.5f);
-    this->Run();
+    this->template Run<6>({{2, 3, 2, 3, 2, 3}, {2, 3, 2, 3, 2, 3}, {2, 2, 2, 2, 2, 4}});
+    this->template Run<6>({{1, 1, 1, 3, 2, 3}, {1, 1, 1, 3, 2, 3}, {1, 1, 1, 2, 2, 4}});
+    this->template Run<2>({{16, 8}, {16, 8}, {16, 8}});
+    this->template Run<2>({{8, 16}, {16, 8}, {8, 16}});
 }
 TYPED_TEST(TestContractionScaleMixedPrecision, scale)
 {
    this->p_cd_element_op = std::make_unique<Scale>(1.f);
-    this->Run();
+    this->template Run<6>({{2, 3, 2, 3, 2, 3}, {2, 3, 2, 3, 2, 3}, {2, 2, 2, 2, 2, 4}});
+    this->template Run<6>({{1, 1, 1, 3, 2, 3}, {1, 1, 1, 3, 2, 3}, {1, 1, 1, 2, 2, 4}});
+    this->template Run<2>({{16, 8}, {16, 8}, {16, 8}});
+    this->template Run<2>({{8, 16}, {16, 8}, {8, 16}});
    this->p_cd_element_op = std::make_unique<Scale>(0.5f);
-    this->Run();
+    this->template Run<6>({{2, 3, 2, 3, 2, 3}, {2, 3, 2, 3, 2, 3}, {2, 2, 2, 2, 2, 4}});
+    this->template Run<6>({{1, 1, 1, 3, 2, 3}, {1, 1, 1, 3, 2, 3}, {1, 1, 1, 2, 2, 4}});
+    this->template Run<2>({{16, 8}, {16, 8}, {16, 8}});
+    this->template Run<2>({{8, 16}, {16, 8}, {8, 16}});
 }