Unverified Commit e6bb1dd7 authored by Po Yen Chen's avatar Po Yen Chen Committed by GitHub
Browse files

Merge branch 'develop' into feature/check-window-lengths

parents 9d6a3704 ab250afd
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
#include "ck/host/headers.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include "ck/tensor_operation/gpu/device/helper.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "common.hpp"
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
#include <fstream>
// need this for verification
/**struct Epilogue
{
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};**/
// Source template handed to the runtime compiler: ${include} is substituted
// with the problem's header path and ${template} with a fully-instantiated
// device-op template string (substitution is done via
// ck::host::InterpolateString inside the test case).
const std::string conv_compile_check = R"__ck__(
#include <${include}>
${template};
)__ck__";
// End-to-end smoke test: for every grouped conv-fwd solution generated for the
// problem below, interpolate the kernel source, JIT-compile it with rtc,
// launch it on device buffers, and verify all instances produce identical
// output (each instance's result is compared against the first one's).
TEST_CASE(test_problem_kernel)
{
// set up problem specification (fixed 2-D grouped forward convolution:
// G=32 groups, batch N=256, C=32 -> K=64 channels, 3x3 filter, 28x28 spatial)
ck::host::conv::Problem_Conv_Fwd prob;
prob.NumDim = 2;
prob.G = 32;
prob.N = 256;
prob.C = 32;
prob.K = 64;
prob.Y = 3;
prob.X = 3;
prob.Hi = 28;
prob.Wi = 28;
prob.Ho = 28;
prob.Wo = 28;
// records the first instance's output and checks later ones against it
check_all<ck::half_t> check;
// user provided fusion operations
std::string epilogue = R"(
struct Epilogue
{
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};
)";
std::string prologue = "";
// length+stride arrays
// Logical dimension order of the length arrays is (G, N, C/K, H, W); the
// stride arrays below make C (resp. K) the fastest-varying dimension
// (stride 1), then G, W, H, N — i.e. an NHWGC/NHWGK-packed memory layout.
ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.N),
static_cast<int>(prob.C),
static_cast<int>(prob.Hi),
static_cast<int>(prob.Wi)};
ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.N),
static_cast<int>(prob.K),
static_cast<int>(prob.Ho),
static_cast<int>(prob.Wo)};
ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.K),
static_cast<int>(prob.C),
static_cast<int>(prob.Y),
static_cast<int>(prob.X)};
// no fused D tensors in this test, hence empty lengths/strides
ck::Array<ck::index_t, 5> d_lengths = {};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
1,
static_cast<int>(prob.Wi * prob.G * prob.C),
static_cast<int>(prob.G * prob.C)};
ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
1,
static_cast<int>(prob.Wo * prob.G * prob.K),
static_cast<int>(prob.G * prob.K)};
// weights are (G, K, C, Y, X) with C fastest-varying (GKYXC-packed)
ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
static_cast<int>(prob.Y * prob.X * prob.C),
1,
static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)};
ck::Array<ck::index_t, 5> d_strides = {};
// unit stride/dilation with 1-pixel padding keeps 28x28 -> 28x28 ("same" conv)
ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
ck::Array<ck::index_t, 2> input_left_pads = {1, 1};
ck::Array<ck::index_t, 2> input_right_pads = {1, 1};
// move the data onto the device (last argument seeds the generator so the
// three tensors get distinct contents)
auto in_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
auto wei_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
auto out_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
// CK Verification: Reference Kernel (disabled; kept for debugging against a
// host-side reference convolution)
/**bool pass = true;
Tensor<ck::half_t> in_host(in_lengths, in_strides);
in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> out_host(out_lengths, out_strides);
std::vector<ck::index_t> conv_filter_strides_ = {1, 1};
std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
std::vector<ck::index_t> input_left_pads_ = {1, 1};
std::vector<ck::index_t> input_right_pads_ = {1, 1};
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
Epilogue>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_host,
wei_host,
out_host,
conv_filter_strides_,
conv_filter_dilations_,
input_left_pads_,
input_right_pads_,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
Epilogue{1.0f, 1.0f});
out_host.SetZero();
ref_invoker.Run(ref_argument);**/
// iterate over every generated solution for gfx908 with the user fusions
for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
{
// substitute instance values into the template
auto src = ck::host::InterpolateString(
conv_compile_check,
{{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
auto srcs = get_headers_for_test();
srcs.push_back({"main.cpp", src});
rtc::compile_options options;
// each instance exposes its kernel as run_<name>
auto name = solution.GetTemplateParameter<std::string>("name");
options.kernel_name = "run_" + name;
auto k = rtc::compile_kernel(srcs, options);
// Grid size calculation: per-sample launch params scaled by batch size
// (in_lengths[1] == N)
auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
auto tmp = get_launch_params(solution, out_lengths, out_strides);
auto grid_size = tmp * in_lengths[1];
// launch the kernel with arguments needed for the argument pointer
k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
wei_dev.data(),
out_dev.data(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
out_lengths,
out_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
// auto res = rtc::from_gpu(out_dev);
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
// assert(pass);
// Simple check: this checks that the output from each instance matches the output from the
// first instance
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
}
}
// Entry point: hand command-line arguments to the test framework's driver,
// which discovers and runs the registered TEST_CASEs.
int main(int argc, const char* argv[])
{
    test::run(argc, argv);
}
......@@ -56,6 +56,8 @@ void write_string(const std::string& filename, const std::string_view& buffer)
}
std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip --cuda-device-only"; }
// TODO: undo after extracting the codeobj
// std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip"; }
kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options)
{
......@@ -89,6 +91,12 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
auto obj = read_buffer(out_path.string());
std::ofstream ofh("obj.o", std::ios::binary);
for(auto i : obj)
ofh << i;
ofh.close();
// int s = std::system(("/usr/bin/cp " + out_path.string() + " codeobj.bin").c_str());
// assert(s == 0);
return kernel{obj.data(), options.kernel_name};
}
......
......@@ -2,6 +2,7 @@
#include <rtc/manage_ptr.hpp>
#include <stdexcept>
#include <cassert>
#include <iostream>
namespace rtc {
......@@ -49,7 +50,10 @@ std::size_t get_available_gpu_memory()
size_t total;
auto status = hipMemGetInfo(&free, &total);
if(status != hipSuccess)
throw std::runtime_error("Failed getting available memory: " + hip_error(status));
{
std::cerr << "Failed getting available memory: " + hip_error(status) << std::endl;
return (8ull * 1024ull * 1024ull * 1024ull);
}
return free;
}
......
rocm-docs-core==0.38.1
rocm-docs-core==1.5.0
sphinxcontrib-bibtex==2.6.2
#
# This file is autogenerated by pip-compile with Python 3.8
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements.in
......@@ -48,12 +48,6 @@ idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.8.0
# via
# sphinx
# sphinxcontrib-bibtex
importlib-resources==6.1.0
# via rocm-docs-core
jinja2==3.1.2
# via
# myst-parser
......@@ -99,8 +93,6 @@ pyjwt[crypto]==2.6.0
# via pygithub
pynacl==1.5.0
# via pygithub
pytz==2023.3.post1
# via babel
pyyaml==6.0
# via
# myst-parser
......@@ -111,7 +103,7 @@ requests==2.31.0
# via
# pygithub
# sphinx
rocm-docs-core==0.38.1
rocm-docs-core==1.5.0
# via -r requirements.in
six==1.16.0
# via
......@@ -165,7 +157,3 @@ urllib3==1.26.18
# via requests
wrapt==1.15.0
# via deprecated
zipp==3.17.0
# via
# importlib-metadata
# importlib-resources
......@@ -22,6 +22,8 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
add_example_executable(example_gemm_xdl_fp16_v2 gemm_xdl_fp16_v2.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_v2)
add_example_executable(example_gemm_xdl_fp16_streamk_v3 gemm_xdl_fp16_streamk_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_streamk_v3)
add_example_executable(example_gemm_xdl_fp16_v3 gemm_xdl_fp16_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_v3)
add_example_executable(example_gemm_xdl_fp8_v3 gemm_xdl_fp8_v3.cpp)
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -159,7 +159,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
case 4:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{1.f, 1.f}(b_k_n);
break;
case 5:
......
This diff is collapsed.
This diff is collapsed.
......@@ -24,4 +24,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_lds_direct_load_fp32)
set(target 1)
endif()
endforeach()
\ No newline at end of file
endforeach()
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment