"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "ecdfe960921032c1aae6dc2c4a3e0ad1b8bba559"
Commit e20ed766 authored by carlushuang

format

parent 1e95a6e2
@@ -153,13 +153,13 @@ bool test_cast(ck_tile::ArgParser args)
         t_.input_type = input_prec;
         t_.output_type = output_prec;
         t_.op = std::string("cast");
         t_.num_cu = [&]() {
             hipDeviceProp_t dev_prop;
             hipDevice_t dev;
             HIP_CHECK_ERROR(hipGetDevice(&dev));
             HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev));
             return dev_prop.multiProcessorCount;
         }();
 
         return t_;
     }();
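For context, the num_cu lambda above caches the device's compute-unit count once so that the dispatch heuristics further down can scale with it. A minimal standalone sketch of the same query, using plain HIP return-code checks instead of the example's HIP_CHECK_ERROR macro:

    #include <hip/hip_runtime.h>
    #include <cstdio>

    // Returns the number of compute units (multiprocessors) on the current
    // device, or -1 if the query fails.
    int query_num_cu()
    {
        int dev = 0;
        if(hipGetDevice(&dev) != hipSuccess)
            return -1;
        hipDeviceProp_t prop;
        if(hipGetDeviceProperties(&prop, dev) != hipSuccess)
            return -1;
        return prop.multiProcessorCount;
    }

    int main()
    {
        printf("num_cu = %d\n", query_num_cu());
        return 0;
    }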
@@ -188,7 +188,8 @@ bool test_cast(ck_tile::ArgParser args)
     ck_tile::stream_config sc{stream_};
     HIP_CHECK_ERROR(hipStreamBeginCapture(sc.stream_id_, hipStreamCaptureModeGlobal));
 
-    for(int i_r = 0; i_r < repeat; i_r++) {
+    for(int i_r = 0; i_r < repeat; i_r++)
+    {
         elementwise(trait, karg, sc);
     }
     HIP_CHECK_ERROR(hipStreamEndCapture(sc.stream_id_, &graph_));
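The loop above is recorded into a HIP graph rather than executed eagerly: every launch issued between hipStreamBeginCapture and hipStreamEndCapture becomes a node of graph_. The hunk does not show the instantiate/replay step, so the following is only a sketch of the standard capture/instantiate/launch pattern with placeholder names (dummy_kernel, capture_and_replay), not the example's own code:

    #include <hip/hip_runtime.h>

    __global__ void dummy_kernel() {}

    // Capture `repeat` launches into a graph, then replay them with one launch.
    // Error checking omitted for brevity.
    void capture_and_replay(hipStream_t stream, int repeat)
    {
        hipGraph_t graph        = nullptr;
        hipGraphExec_t instance = nullptr;

        hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal);
        for(int i = 0; i < repeat; i++)
            dummy_kernel<<<1, 64, 0, stream>>>(); // recorded, not executed
        hipStreamEndCapture(stream, &graph);

        hipGraphInstantiate(&instance, graph, nullptr, nullptr, 0);
        hipGraphLaunch(instance, stream); // replays all captured launches
        hipStreamSynchronize(stream);

        hipGraphExecDestroy(instance);
        hipGraphDestroy(graph);
    }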
@@ -201,8 +202,9 @@ bool test_cast(ck_tile::ArgParser args)
     HIP_CHECK_ERROR(hipEventCreate(&start_));
     HIP_CHECK_ERROR(hipEventCreate(&stop_));
 
-    //warm-up
-    for(int i_r = 0; i_r < warpup; i_r++) {
+    // warm-up
+    for(int i_r = 0; i_r < warpup; i_r++)
+    {
         elementwise(trait, karg, sc);
     }
     HIP_CHECK_ERROR(hipDeviceSynchronize());
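The events created here are presumably used to time the captured graph; the measurement itself is outside this hunk, but the next hunk's ms = total_time / repeat line suggests one graph launch containing `repeat` kernels, averaged afterwards. A hedged sketch of that event-timing pattern (generic HIP, names are placeholders):

    #include <hip/hip_runtime.h>

    // Time one replay of a graph whose capture contains `repeat` launches and
    // return the average per-launch time in ms. Error checking omitted.
    float time_avg_ms(hipStream_t stream, hipGraphExec_t instance, int repeat)
    {
        hipEvent_t start, stop;
        hipEventCreate(&start);
        hipEventCreate(&stop);

        hipEventRecord(start, stream);
        hipGraphLaunch(instance, stream); // graph holds `repeat` kernel launches
        hipEventRecord(stop, stream);
        hipEventSynchronize(stop);

        float total_ms = 0.f;
        hipEventElapsedTime(&total_ms, start, stop);

        hipEventDestroy(start);
        hipEventDestroy(stop);
        return total_ms / repeat;
    }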
@@ -225,12 +227,17 @@ bool test_cast(ck_tile::ArgParser args)
         ms = total_time / repeat;
     }
 #endif
-    auto gbps = [&](){
+    auto gbps = [&]() {
         double total_bytes = num_pixels * sizeof(SrcType) + num_pixels * sizeof(DstType);
         return total_bytes / 1.E6 / ms;
     }();
-    printf(
-        "[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms*1e6, ms, gbps);
+    printf("[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, ",
+           input_prec.c_str(),
+           output_prec.c_str(),
+           num_pixels,
+           ms * 1e6,
+           ms,
+           gbps);
     if(ms < 0)
         printf("not supported\n");
     fflush(stdout);
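The gbps lambda reports effective memory bandwidth: each pixel is read once as SrcType and written once as DstType, so the bytes moved are num_pixels * (sizeof(SrcType) + sizeof(DstType)), and dividing bytes by 1e6 and then by milliseconds is the same as (bytes / 1e9) / seconds, i.e. GB/s. A tiny standalone check of the arithmetic with made-up numbers:

    #include <cstdio>
    #include <cstdint>

    int main()
    {
        // Example: 1<<26 fp16 pixels cast to fp32 in 1.5 ms (invented figures).
        uint64_t num_pixels = 1ull << 26;
        double ms           = 1.5;
        double total_bytes  = num_pixels * 2.0 + num_pixels * 4.0; // fp16 read + fp32 write
        double gbps         = total_bytes / 1.e6 / ms;             // == (bytes / 1e9) / (ms / 1e3)
        printf("%.2f GB/s\n", gbps); // ~268.44 GB/s for these numbers
        return 0;
    }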
@@ -12,20 +12,22 @@ struct Cast
 } // namespace impl
 
 #define DISPATCH_E_CAST_(d_type_, s_type_, byte_per_issue_, chunks_, bs_) \
     using src_t = s_type_; \
     using dst_t = d_type_; \
     using u_fun = typename impl::Cast; \
-    using problem = \
-        ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
+    using problem = ck_tile:: \
+        ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
     using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \
     using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \
     \
     auto kargs = kernel::MakeKargs(a); \
     const dim3 grids = kernel::GridSize(a); \
     constexpr dim3 blocks = kernel::BlockSize(); \
     \
     float ave_time = ck_tile::launch_kernel( \
-        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
+        s, \
+        ck_tile::make_kernel<blocks.x, 1>( \
+            kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
     return ave_time;
 
 float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
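Because DISPATCH_E_CAST_ expands into a series of using declarations plus a return, each invocation has to live in its own brace scope, which is exactly what the if/else ladder inside elementwise() below provides. A simplified sketch of the same macro-dispatch shape, with hypothetical stand-in names (launch_cast, DISPATCH_CAST_) rather than the real ck_tile kernel machinery:

    #include <cstdio>

    // Hypothetical stand-in for a templated kernel launcher.
    template <typename SrcT, typename DstT, int BlockSize>
    float launch_cast(const void*, void*, unsigned long long n)
    {
        printf("launch: %d-byte -> %d-byte, block %d, n=%llu\n",
               (int)sizeof(SrcT), (int)sizeof(DstT), BlockSize, n);
        return 0.f; // would return the measured time
    }

    // Same shape as DISPATCH_E_CAST_: type aliases plus a return, so every
    // expansion needs its own scope.
    #define DISPATCH_CAST_(d_type_, s_type_, bs_)              \
        using src_t = s_type_;                                 \
        using dst_t = d_type_;                                 \
        return launch_cast<src_t, dst_t, bs_>(p_in, p_out, n);

    float dispatch(const void* p_in, void* p_out, unsigned long long n)
    {
        if(n < 1024)
        {
            DISPATCH_CAST_(float, unsigned short, 64)
        }
        else
        {
            DISPATCH_CAST_(float, unsigned short, 256)
        }
    }

    int main()
    {
        float buf_in[4] = {};
        float buf_out[4];
        dispatch(buf_in, buf_out, 4);
        return 0;
    }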
@@ -36,49 +38,63 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
     if(t.output_type == "fp32" && t.input_type == "fp16")
     {
         constexpr int eb = sizeof(ck_tile::fp16_t);
-        if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
-            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 64)
+        if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64))
+        {
+            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 64)
         }
-        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
-            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 128)
+        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128))
+        {
+            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 128)
         }
-        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
-            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 256)
+        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3))
+        {
+            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
         }
-        else if (a.num_pixels % 4 == 0) {
-            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
+        else if(a.num_pixels % 4 == 0)
+        {
+            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8))
+            {
                 DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 1, 256)
             }
-            else {
+            else
+            {
                 DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 8, 256)
             }
         }
-        else {
+        else
+        {
             DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
         }
     }
     else if(t.output_type == "fp16" && t.input_type == "fp32")
     {
         constexpr int eb = sizeof(float);
-        if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
-            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 64)
+        if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64))
+        {
+            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 64)
         }
-        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
-            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 128)
+        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128))
+        {
+            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 128)
        }
-        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
-            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
+        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3))
+        {
+            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 256)
         }
-        else if (a.num_pixels % 4 == 0) {
-            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
+        else if(a.num_pixels % 4 == 0)
+        {
+            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8))
+            {
                 DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 1, 256)
             }
-            else {
+            else
+            {
                 DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 8, 256)
             }
         }
-        else {
-            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
+        else
+        {
+            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 256)
         }
     }
 }
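The ladder above picks launch parameters from problem size relative to CU count: small problems get small blocks (64/128 threads, scalar loads), medium ones 256 threads, and only large, 4-aligned problems use 4-element vector issues, switching to 8 chunks per block once the grid would otherwise oversubscribe. A compact restatement of that heuristic as data; the threshold multipliers are copied from the code above, while the struct and function names are illustrative only:

    #include <cstdint>

    struct cast_config
    {
        int vector_elems; // elements loaded per issue
        int chunks;       // issues per thread
        int block_size;   // threads per block
    };

    // Mirrors the if/else ladder in elementwise(): thresholds scale with num_cu.
    cast_config pick_cast_config(uint64_t num_pixels, int num_cu)
    {
        const uint64_t cu = static_cast<uint64_t>(num_cu);
        if(num_pixels < cu * 64)
            return {1, 1, 64};
        if(num_pixels < cu * 128)
            return {1, 1, 128};
        if(num_pixels < cu * 256 * 3)
            return {1, 1, 256};
        if(num_pixels % 4 == 0)
        {
            if(num_pixels < cu * 256 * 4 * 8)
                return {4, 1, 256};
            return {4, 8, 256};
        }
        return {1, 1, 256};
    }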
@@ -57,17 +57,15 @@ struct ElementwiseUnaryKernel
     CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }
 
-    CK_TILE_DEVICE void operator()(const void* p_input_,
-                                   void* p_output_,
-                                   uint64_t num_pixels_) const
+    CK_TILE_DEVICE void
+    operator()(const void* p_input_, void* p_output_, uint64_t num_pixels_) const
     {
         uint64_t block_base =
             static_cast<uint64_t>(blockIdx.x) * Problem::BlockSize * Problem::VectorSize;
         uint64_t pixels_rem = num_pixels_ - block_base;
 
         const auto input_window = [&]() {
-            const InputType* p_input =
-                reinterpret_cast<const InputType*>(p_input_) + block_base;
+            const InputType* p_input = reinterpret_cast<const InputType*>(p_input_) + block_base;
 
             auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                 p_input,
@@ -79,8 +77,7 @@ struct ElementwiseUnaryKernel
         }();
 
         auto output_window = [&]() {
-            OutputType* p_output =
-                reinterpret_cast<OutputType*>(p_output_) + block_base;
+            OutputType* p_output = reinterpret_cast<OutputType*>(p_output_) + block_base;
 
             auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                 p_output,
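Each block of ElementwiseUnaryKernel covers BlockSize * VectorSize contiguous pixels starting at block_base, and pixels_rem tells the tile loads/stores how much of that span is still valid in the last block. A minimal plain-HIP sketch of the same base-offset-plus-tail indexing, with a simple per-element bound check in place of ck_tile's pixels_rem/tensor-view path (the kernel name and layout choices here are assumptions, not the library's):

    #include <hip/hip_runtime.h>
    #include <hip/hip_fp16.h>
    #include <cstdint>

    // Each block handles BLOCK_SIZE * VECTOR_SIZE contiguous elements; each
    // thread casts VECTOR_SIZE contiguous elements. Launch with
    // grid = ceil(n / (BLOCK_SIZE * VECTOR_SIZE)) blocks of BLOCK_SIZE threads.
    template <int BLOCK_SIZE, int VECTOR_SIZE>
    __global__ void cast_fp16_to_fp32(const __half* in, float* out, uint64_t n)
    {
        uint64_t block_base  = static_cast<uint64_t>(blockIdx.x) * BLOCK_SIZE * VECTOR_SIZE;
        uint64_t thread_base = block_base + static_cast<uint64_t>(threadIdx.x) * VECTOR_SIZE;
        for(int v = 0; v < VECTOR_SIZE; v++)
        {
            uint64_t idx = thread_base + v;
            if(idx < n) // tail guard for the last block
                out[idx] = __half2float(in[idx]);
        }
    }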
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
#endif #endif
namespace ck_tile { namespace ck_tile {
template <typename Problem_, typename Policy_ = ElementwiseUnaryPolicy> template <typename Problem_, typename Policy_ = ElementwiseUnaryPolicy>
struct ElementwiseUnaryipeline struct ElementwiseUnaryipeline
{ {
...@@ -37,7 +37,8 @@ struct ElementwiseUnaryipeline ...@@ -37,7 +37,8 @@ struct ElementwiseUnaryipeline
static_for<0, Problem::Chunks, 1>{}([&](auto) { static_for<0, Problem::Chunks, 1>{}([&](auto) {
auto x = load_tile(inp_win); auto x = load_tile(inp_win);
auto y = make_static_distributed_tensor<typename Problem::OutputType>(x.get_tile_distribution()); auto y = make_static_distributed_tensor<typename Problem::OutputType>(
x.get_tile_distribution());
tile_elementwise_inout(UnaryFunctor{}, y, x); tile_elementwise_inout(UnaryFunctor{}, y, x);
store_tile(out_win, y); store_tile(out_win, y);
......
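The pipeline body runs Chunks iterations at compile time via static_for, each one a load, elementwise cast, store round trip on a tile (the window advance between chunks lies outside this hunk). Below is a hedged sketch of a minimal compile-time static_for of the kind used here; it is generic C++, not the ck_tile implementation, which is a function object invoked as static_for<...>{}(f):

    #include <cstdio>
    #include <type_traits>
    #include <utility>

    // Minimal compile-time loop: calls f(integral_constant<int, I>) for I in
    // [Begin, End) with step Inc, fully unrolled by the compiler.
    template <int Begin, int End, int Inc, typename F>
    void static_for(F&& f)
    {
        if constexpr(Begin < End)
        {
            f(std::integral_constant<int, Begin>{});
            static_for<Begin + Inc, End, Inc>(std::forward<F>(f));
        }
    }

    int main()
    {
        // Unrolled at compile time, like the Chunks loop in the pipeline.
        static_for<0, 4, 1>([](auto i) { printf("chunk %d\n", i.value); });
        return 0;
    }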
@@ -28,10 +28,10 @@ struct ElementwiseUnaryWarpPerRowProblem
     static constexpr index_t WarpSize = get_warp_size();
 
     static_assert(BytesPerIssue % sizeof(InputType) == 0);
     static constexpr index_t VectorSize = BytesPerIssue / sizeof(InputType);
     static constexpr index_t LanesPerRow = WarpSize;
     static constexpr index_t WarpsPerBlock = BlockSize / LanesPerRow;
     static constexpr index_t IssuesPerRow = 1;
 
 #if 0
     static_assert(Experts % VectorSize == 0);
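In the problem descriptor, VectorSize is derived from the byte width of one load: BytesPerIssue / sizeof(InputType), with the static_assert guaranteeing the division is exact. A tiny compile-time check of that relationship, using uint16_t as a stand-in for ck_tile::fp16_t and values matching the "4 * eb" dispatch path above:

    #include <cstdint>

    // fp16 input with a 4-element (8-byte) issue, as in the "4 * eb" path.
    using InputType             = uint16_t; // stand-in for ck_tile::fp16_t
    constexpr int BytesPerIssue = 4 * sizeof(InputType);
    static_assert(BytesPerIssue % sizeof(InputType) == 0);
    constexpr int VectorSize = BytesPerIssue / sizeof(InputType);
    static_assert(VectorSize == 4); // 8 bytes / 2 bytes per fp16 element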