update script

1e95a6e2 · carlushuang · 50ba9c44 · 1e95a6e2 · 1e95a6e2 · 1e95a6e2
Commit 1e95a6e2 authored Sep 13, 2024 by carlushuang
4 changed files
--- a/example/ck_tile/19_elementwise/elementwise.cpp
+++ b/example/ck_tile/19_elementwise/elementwise.cpp
@@ -17,6 +17,10 @@
 #define TEST_ELEMENTWISE_VERBOSE 1
 #endif

+#ifndef TEST_ELEMENTWISE_HIPGRAPH
+#define TEST_ELEMENTWISE_HIPGRAPH 1
+#endif
+
 template <typename T>
 void dump_host_tensor_2d(const ck_tile::HostTensor<T>& x)
 {
@@ -149,6 +153,13 @@ bool test_cast(ck_tile::ArgParser args)
        t_.input_type  = input_prec;
        t_.output_type = output_prec;
        t_.op          = std::string("cast");
+        t_.num_cu = [&]() {
+                hipDeviceProp_t dev_prop;
+                hipDevice_t dev;
+                HIP_CHECK_ERROR(hipGetDevice(&dev));
+                HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev));
+                return dev_prop.multiProcessorCount;
+            }();
        return t_;
    }();

@@ -161,11 +172,65 @@ bool test_cast(ck_tile::ArgParser args)
    }();

 #if TEST_ELEMENTWISE_VERBOSE
-    ck_tile::stream_config sc{nullptr, true};
+#if !TEST_ELEMENTWISE_HIPGRAPH
+    ck_tile::stream_config sc{nullptr, true, 0, 20, 50, false};
    // ck_tile::stream_config sc{nullptr};
    auto ms = elementwise(trait, karg, sc);
+#else
+    float ms = 0;
+    {
+        int repeat = 50;
+        int warpup = 20;
+        hipGraph_t graph_;
+        hipStream_t stream_;
+
+        HIP_CHECK_ERROR(hipStreamCreate(&stream_));
+        ck_tile::stream_config sc{stream_};
+
+        HIP_CHECK_ERROR(hipStreamBeginCapture(sc.stream_id_, hipStreamCaptureModeGlobal));
+        for(int i_r = 0; i_r < repeat; i_r++) {
+            elementwise(trait, karg, sc);
+        }
+        HIP_CHECK_ERROR(hipStreamEndCapture(sc.stream_id_, &graph_));
+
+        hipGraphExec_t instance_;
+        HIP_CHECK_ERROR(hipGraphInstantiate(&instance_, graph_, nullptr, nullptr, 0));
+
+        hipEvent_t start_, stop_;
+
+        HIP_CHECK_ERROR(hipEventCreate(&start_));
+        HIP_CHECK_ERROR(hipEventCreate(&stop_));
+
+        //warm-up
+        for(int i_r = 0; i_r < warpup; i_r++) {
+            elementwise(trait, karg, sc);
+        }
+        HIP_CHECK_ERROR(hipDeviceSynchronize());
+
+        HIP_CHECK_ERROR(hipEventRecord(start_, sc.stream_id_));
+
+        HIP_CHECK_ERROR(hipGraphLaunch(instance_, sc.stream_id_));
+
+        HIP_CHECK_ERROR(hipEventRecord(stop_, sc.stream_id_));
+        HIP_CHECK_ERROR(hipEventSynchronize(stop_));
+
+        HIP_CHECK_ERROR(hipGetLastError());
+
+        HIP_CHECK_ERROR(hipGraphDestroy(graph_));
+
+        float total_time = 0;
+
+        HIP_CHECK_ERROR(hipEventElapsedTime(&total_time, start_, stop_));
+
+        ms = total_time / repeat;
+    }
+#endif
+    auto gbps = [&](){
+        double total_bytes = num_pixels * sizeof(SrcType) + num_pixels * sizeof(DstType);
+        return total_bytes / 1.E6 / ms;
+    }();
    printf(
-        "[cast] %s->%s, n:%lu,  ms:%f, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms);
+        "[cast] %s->%s, n:%lu,  ns:%f(ms:%f), %.2fGB/s, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms*1e6, ms, gbps);
    if(ms < 0)
        printf("not supported\n");
    fflush(stdout);

--- a/example/ck_tile/19_elementwise/elementwise_api.cpp
+++ b/example/ck_tile/19_elementwise/elementwise_api.cpp
@@ -11,12 +11,12 @@ struct Cast
 };
 } // namespace impl

-#define DISPATCH_ELEMENTWISE_CAST(d_type_, s_type_, byte_per_issue_, chunks_)                      \
+#define DISPATCH_E_CAST_(d_type_, s_type_, byte_per_issue_, chunks_, bs_)                      \
    using src_t = s_type_;                                                                         \
    using dst_t = d_type_;                                                                         \
    using u_fun = typename impl::Cast;                                                             \
    using problem =                                                                                \
-        ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_>; \
+        ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
    using pipeline = ck_tile::ElementwiseUnaryipeline<problem>;                                    \
    using kernel   = ck_tile::ElementwiseUnaryKernel<pipeline>;                                    \
                                                                                                   \
@@ -25,7 +25,7 @@ struct Cast
    constexpr dim3 blocks = kernel::BlockSize();                                                   \
                                                                                                   \
    float ave_time = ck_tile::launch_kernel(                                                       \
-        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels));                  \
+        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels));   \
    return ave_time;

 float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
@@ -35,11 +35,51 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf
    {
        if(t.output_type == "fp32" && t.input_type == "fp16")
        {
-            DISPATCH_ELEMENTWISE_CAST(float, ck_tile::fp16_t, 8*sizeof(ck_tile::fp16_t), 8)
+            constexpr int eb = sizeof(ck_tile::fp16_t);
+            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
+                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 64)
+            }
+            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
+                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 128)
+            }
+            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
+                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 256)
+            }
+            else if (a.num_pixels % 4 == 0) {
+                if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
+                    DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 1, 256)
+                }
+                else {
+                    DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 8, 256)
+                }
+            }
+            else {
+                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
+            }
        }
        else if(t.output_type == "fp16" && t.input_type == "fp32")
        {
-            DISPATCH_ELEMENTWISE_CAST(ck_tile::fp16_t, float, 4*sizeof(float), 8)
+            constexpr int eb = sizeof(float);
+            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
+                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 64)
+            }
+            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
+                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 128)
+            }
+            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
+                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
+            }
+            else if (a.num_pixels % 4 == 0) {
+                if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
+                    DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 1, 256)
+                }
+                else {
+                    DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 8, 256)
+                }
+            }
+            else {
+                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
+            }
        }
    }
    return rtn;

--- a/example/ck_tile/19_elementwise/elementwise_api.hpp
+++ b/example/ck_tile/19_elementwise/elementwise_api.hpp
@@ -13,6 +13,7 @@ struct elementwise_trait
    std::string acc_type;    // type to do intermediate computation
    std::string output_type; // type to store out
    std::string op;
+    int num_cu;
 };

 struct elementwise_kargs : public ck_tile::ElementwiseUnaryHostArgs

--- a/example/ck_tile/19_elementwise/script/bench.sh
+++ b/example/ck_tile/19_elementwise/script/bench.sh
+#!/bin/sh
+
+EXE=./build/bin/tile_example_elementwise
+
+$EXE -pr_i=fp16 -pr_o=fp32 -n=2043904
+$EXE -pr_i=fp16 -pr_o=fp32 -n=992256
+$EXE -pr_i=fp16 -pr_o=fp32 -n=846304
+$EXE -pr_i=fp16 -pr_o=fp32 -n=434176
+$EXE -pr_i=fp16 -pr_o=fp32 -n=159424
+$EXE -pr_i=fp16 -pr_o=fp32 -n=98304
+$EXE -pr_i=fp16 -pr_o=fp32 -n=73728
+$EXE -pr_i=fp16 -pr_o=fp32 -n=17408
+$EXE -pr_i=fp16 -pr_o=fp32 -n=512
+$EXE -pr_i=fp16 -pr_o=fp32 -n=256
+
+echo "-------------------------------------"
+
+$EXE -pr_i=fp32 -pr_o=fp16 -n=2043904
+$EXE -pr_i=fp32 -pr_o=fp16 -n=992256
+$EXE -pr_i=fp32 -pr_o=fp16 -n=846304
+$EXE -pr_i=fp32 -pr_o=fp16 -n=434176
+$EXE -pr_i=fp32 -pr_o=fp16 -n=159424
+$EXE -pr_i=fp32 -pr_o=fp16 -n=98304
+$EXE -pr_i=fp32 -pr_o=fp16 -n=73728
+$EXE -pr_i=fp32 -pr_o=fp16 -n=17408
+$EXE -pr_i=fp32 -pr_o=fp16 -n=512
+$EXE -pr_i=fp32 -pr_o=fp16 -n=256