format

e20ed766 · carlushuang · 1e95a6e2 · e20ed766 · e20ed766 · e20ed766
Commit e20ed766 authored Sep 13, 2024 by carlushuang
4 changed files
--- a/example/ck_tile/19_elementwise/elementwise.cpp
+++ b/example/ck_tile/19_elementwise/elementwise.cpp
@@ -188,7 +188,8 @@ bool test_cast(ck_tile::ArgParser args)
        ck_tile::stream_config sc{stream_};
        HIP_CHECK_ERROR(hipStreamBeginCapture(sc.stream_id_, hipStreamCaptureModeGlobal));
-        for(int i_r = 0; i_r < repeat; i_r++) {
+        for(int i_r = 0; i_r < repeat; i_r++)
+        {
            elementwise(trait, karg, sc);
        }
        HIP_CHECK_ERROR(hipStreamEndCapture(sc.stream_id_, &graph_));
@@ -201,8 +202,9 @@ bool test_cast(ck_tile::ArgParser args)
        HIP_CHECK_ERROR(hipEventCreate(&start_));
        HIP_CHECK_ERROR(hipEventCreate(&stop_));
-        //warm-up
+        // warm-up
-        for(int i_r = 0; i_r < warpup; i_r++) {
+        for(int i_r = 0; i_r < warpup; i_r++)
+        {
            elementwise(trait, karg, sc);
        }
        HIP_CHECK_ERROR(hipDeviceSynchronize());
@@ -225,12 +227,17 @@ bool test_cast(ck_tile::ArgParser args)
        ms = total_time / repeat;
    }
 #endif
-    auto gbps = [&](){
+    auto gbps = [&]() {
        double total_bytes = num_pixels * sizeof(SrcType) + num_pixels * sizeof(DstType);
        return total_bytes / 1.E6 / ms;
    }();
-    printf(
+    printf("[cast] %s->%s, n:%lu,  ns:%f(ms:%f), %.2fGB/s, ",
-        "[cast] %s->%s, n:%lu,  ns:%f(ms:%f), %.2fGB/s, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms*1e6, ms, gbps);
+           input_prec.c_str(),
+           output_prec.c_str(),
+           num_pixels,
+           ms * 1e6,
+           ms,
+           gbps);
    if(ms < 0)
        printf("not supported\n");
    fflush(stdout);

--- a/example/ck_tile/19_elementwise/elementwise_api.cpp
+++ b/example/ck_tile/19_elementwise/elementwise_api.cpp
@@ -15,8 +15,8 @@ struct Cast
    using src_t   = s_type_;                                                                   \
    using dst_t   = d_type_;                                                                   \
    using u_fun   = typename impl::Cast;                                                       \
-    using problem =                                                                                \
+    using problem = ck_tile::                                                                  \
-        ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
+        ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
    using pipeline = ck_tile::ElementwiseUnaryipeline<problem>;                                \
    using kernel   = ck_tile::ElementwiseUnaryKernel<pipeline>;                                \
                                                                                               \
@@ -25,7 +25,9 @@ struct Cast
    constexpr dim3 blocks = kernel::BlockSize();                                               \
                                                                                               \
    float ave_time = ck_tile::launch_kernel(                                                   \
-        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels));   \
+        s,                                                                                     \
+        ck_tile::make_kernel<blocks.x, 1>(                                                     \
+            kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels));     \
    return ave_time;
 float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
@@ -36,49 +38,63 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf
        if(t.output_type == "fp32" && t.input_type == "fp16")
        {
            constexpr int eb = sizeof(ck_tile::fp16_t);
-            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
+            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64))
-                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 64)
+            {
+                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 64)
            }
-            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
+            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128))
-                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 128)
+            {
+                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 128)
            }
-            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
+            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3))
-                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 256)
+            {
+                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
            }
-            else if (a.num_pixels % 4 == 0) {
+            else if(a.num_pixels % 4 == 0)
-                if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
+            {
+                if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8))
+                {
                    DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 1, 256)
                }
-                else {
+                else
+                {
                    DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 8, 256)
                }
            }
-            else {
+            else
+            {
                DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
            }
        }
        else if(t.output_type == "fp16" && t.input_type == "fp32")
        {
            constexpr int eb = sizeof(float);
-            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
+            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64))
-                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 64)
+            {
+                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 64)
            }
-            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
+            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128))
-                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 128)
+            {
+                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 128)
            }
-            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
+            else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3))
-                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
+            {
+                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 256)
            }
-            else if (a.num_pixels % 4 == 0) {
+            else if(a.num_pixels % 4 == 0)
-                if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
+            {
+                if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8))
+                {
                    DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 1, 256)
                }
-                else {
+                else
+                {
                    DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 8, 256)
                }
            }
-            else {
+            else
-                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
+            {
+                DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 256)
            }
        }
    }

--- a/example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp
+++ b/example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp
@@ -57,17 +57,15 @@ struct ElementwiseUnaryKernel
    CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }
-    CK_TILE_DEVICE void operator()(const void* p_input_,
+    CK_TILE_DEVICE void
-        void* p_output_,
+    operator()(const void* p_input_, void* p_output_, uint64_t num_pixels_) const
-        uint64_t num_pixels_) const
    {
        uint64_t block_base =
            static_cast<uint64_t>(blockIdx.x) * Problem::BlockSize * Problem::VectorSize;
        uint64_t pixels_rem = num_pixels_ - block_base;
        const auto input_window = [&]() {
-            const InputType* p_input =
+            const InputType* p_input = reinterpret_cast<const InputType*>(p_input_) + block_base;
-                reinterpret_cast<const InputType*>(p_input_) + block_base;
            auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                p_input,
@@ -79,8 +77,7 @@ struct ElementwiseUnaryKernel
        }();
        auto output_window = [&]() {
-            OutputType* p_output =
+            OutputType* p_output = reinterpret_cast<OutputType*>(p_output_) + block_base;
-                reinterpret_cast<OutputType*>(p_output_) + block_base;
            auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                p_output,

--- a/example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_pipeline.hpp
+++ b/example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_pipeline.hpp
@@ -37,7 +37,8 @@ struct ElementwiseUnaryipeline
        static_for<0, Problem::Chunks, 1>{}([&](auto) {
            auto x = load_tile(inp_win);
-            auto y = make_static_distributed_tensor<typename Problem::OutputType>(x.get_tile_distribution());
+            auto y = make_static_distributed_tensor<typename Problem::OutputType>(
+                x.get_tile_distribution());
            tile_elementwise_inout(UnaryFunctor{}, y, x);
            store_tile(out_win, y);