Commit e34eeb6d authored by carlushuang's avatar carlushuang
Browse files

update unary element-wise

parent 6252d207
# not using add_example_executable() to add this target, since we don't want this to have
# to be included in "make all/install/check"
add_executable(tile_example_elementwise EXCLUDE_FROM_ALL elementwise.cpp elementwise_api.cpp)
target_include_directories(tile_example_elementwise PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
# NOTE: "-v --save-temps" were compiler-debugging leftovers (verbose driver
# output and .i/.s intermediates dumped into the build tree); removed so a
# normal build stays clean. -Wno-gnu-line-marker is kept for parity with the
# other tile examples.
target_compile_options(tile_example_elementwise PRIVATE -Wno-gnu-line-marker)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include <iostream>
#include <numeric>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <time.h>
#include <unordered_set>
#include "ck_tile/core.hpp"
#include "elementwise_api.hpp"
#ifndef TEST_ELEMENTWISE_VERBOSE
#define TEST_ELEMENTWISE_VERBOSE 1
#endif
// Debug helper: pretty-print a 2-D host tensor to stdout.
// Output shape: "[0: [v0,v1,...],\n 1: [...]\n]" followed by a separator rule.
// fp16 elements are converted to float first so operator<< prints a readable
// number instead of the raw half-precision wrapper type.
//
// Fixes vs. the previous version:
//  - the non-fp16 branch used " " as separator while fp16 used "," -- the
//    separator is now uniform for every element type;
//  - the closing outer "]" was only emitted from inside the row loop, so an
//    empty tensor (len[0] == 0) never closed the bracket -- it is now printed
//    unconditionally after the loop.
template <typename T>
void dump_host_tensor_2d(const ck_tile::HostTensor<T>& x)
{
    const auto len = x.get_lengths();
    assert(len.size() == 2);
    std::cout << "[";
    for(size_t i = 0; i < len[0]; i++)
    {
        std::cout << i << ": [";
        for(size_t j = 0; j < len[1]; j++)
        {
            if constexpr(std::is_same_v<T, ck_tile::fp16_t>)
            {
                // print as float; fp16 has no stream operator of its own
                std::cout << ck_tile::type_convert<float>(x(std::vector<std::size_t>{i, j}));
            }
            else
            {
                std::cout << x(std::vector<std::size_t>{i, j});
            }
            if(j != len[1] - 1)
                std::cout << ",";
        }
        std::cout << "]";
        if(i != len[0] - 1)
            std::cout << ",";
        std::cout << std::endl;
    }
    std::cout << "]" << std::endl; // close outer bracket even for empty tensors
    std::cout << "--------------------" << std::endl;
}
// Generic cast functor used by the CPU reference path: writes the source
// value into the destination reference after converting it to DstType.
struct Cast
{
    template <typename DstType, typename SrcType>
    CK_TILE_HOST_DEVICE void operator()(DstType& y, const SrcType& x) const
    {
        const DstType converted = ck_tile::type_convert<DstType>(x);
        y = converted;
    }
};
// CPU reference: apply UnaryF element-wise over x, producing a same-shaped
// DstType tensor. UnaryF must expose operator()(DstType&, const SrcType&).
template <typename DstType, typename SrcType, typename UnaryF>
auto reference_elementwise_unary(const ck_tile::HostTensor<SrcType>& x)
{
    ck_tile::HostTensor<DstType> result(x.get_lengths());
    const UnaryF functor{};
    result.ForEach([&](auto& self, auto idx) { functor(self(idx), x(idx)); });
    return result;
}
// Comparison thresholds for validation, per data type.
// Default: (rtol, atol) = (1e-3, 1e-3) for every type without an explicit
// specialization below.
template <typename DataType>
auto get_elimit(std::string /*init_method*/)
{
    constexpr double tolerance = 1e-3;
    return ck_tile::make_tuple(tolerance, tolerance);
}
// bf16 carries fewer mantissa bits, so both tolerances are widened to 1e-2.
template <>
auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
{
    constexpr double tolerance = 1e-2;
    return ck_tile::make_tuple(tolerance, tolerance);
}
// fp8 is too coarse for a relative tolerance; instead return
// (max_rounding_point_distance, atol). Integer-style init ("ui"/"ni")
// produces exactly representable values, so the bound is tight; otherwise a
// one-ULP rounding distance and a wide absolute tolerance are allowed.
template <>
auto get_elimit<ck_tile::fp8_t>(std::string init_method)
{
    const bool integer_init = (init_method == "ui" || init_method == "ni");
    if(integer_init)
    {
        return ck_tile::make_tuple(0u, 2e-3);
    }
    return ck_tile::make_tuple(1u, 0.0625);
}
// Build the command-line parser for this example.
// Returns (parse_ok, parser); parse_ok is false when the arguments were
// rejected. Help-string typos fixed: "weather" -> "whether", "t to 1" ->
// "set to 1". Option keys and defaults are unchanged.
auto create_args(int argc, char* argv[])
{
    ck_tile::ArgParser arg_parser;
    arg_parser.insert("v", "1", "whether to do CPU validation or not")
        .insert("op", "cast", "which elementwise operator to run")
        .insert("pr_i", "fp16", "input precision")
        .insert("pr_o", "fp32", "output precision")
        .insert("n", "1000", "number of pixels to cast")
        .insert("seed", "-1", "seed to be used, -1 means random every time")
        .insert("kname", "0", "set to 1 will print kernel name");
    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
}
// Run the device "cast" kernel for SrcType -> DstType over `n` pixels and,
// when -v=1, validate against the CPU reference.
// Returns true when the kernel ran (trait combination supported) and, if
// enabled, validation passed.
template <typename DstType, typename SrcType>
bool test_cast(ck_tile::ArgParser args)
{
    const int validate            = args.get_int("v");
    const std::string input_prec  = args.get_str("pr_i");
    const std::string output_prec = args.get_str("pr_o");
    const uint64_t num_pixels     = args.get_uint64("n");
    int seed                      = args.get_int("seed");
    if(seed < 0)
    {
        // negative seed requests a fresh random sequence every run
        seed = static_cast<int>(std::time(nullptr));
    }

    // flat 1-D buffers; tokens already considered batch size
    ck_tile::HostTensor<SrcType> x_host({num_pixels});
    ck_tile::HostTensor<DstType> y_host({num_pixels});
    ck_tile::FillUniformDistribution<SrcType>{-5, 5, seed}(x_host);

    ck_tile::DeviceMem x_dev(x_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem y_dev(y_host.get_element_space_size_in_bytes());
    x_dev.ToDevice(x_host.data());

    elementwise_trait trait;
    trait.input_type  = input_prec;
    trait.output_type = output_prec;
    trait.op          = std::string("cast");

    elementwise_kargs karg;
    karg.p_input    = x_dev.GetDeviceBuffer();
    karg.p_output   = y_dev.GetDeviceBuffer();
    karg.num_pixels = num_pixels;

#if TEST_ELEMENTWISE_VERBOSE
    ck_tile::stream_config sc{nullptr, true};
    auto ms = elementwise(trait, karg, sc);
    // cast + %llu keeps the format portable across LP64/LLP64 targets
    // (uint64_t is not `unsigned long` everywhere)
    printf("[cast] %s->%s, n:%llu, ms:%f, ",
           input_prec.c_str(),
           output_prec.c_str(),
           static_cast<unsigned long long>(num_pixels),
           ms);
    if(ms < 0)
        printf("not supported\n");
    fflush(stdout);
#else
    ck_tile::stream_config sc{nullptr};
    // BUGFIX: this path previously called elementwise_unary(), which is not
    // declared anywhere; the API entry point is elementwise().
    auto ms = elementwise(trait, karg, sc);
#endif
    if(ms < 0)
    {
        // dispatcher does not support this type combination
        return false;
    }

    y_dev.FromDevice(y_host.data());

    bool rtn = true;
    if(validate)
    {
        // this host buffer will not copy to GPU, so no need use stride
        auto y_ref = reference_elementwise_unary<DstType, SrcType, Cast>(x_host);
        // NOTE(review): tolerance is selected from the *input* type; when the
        // output is the lower-precision side the bound may be too tight --
        // confirm intended behavior.
        auto [rtol, atol] = get_elimit<SrcType>("");
        rtn &= ck_tile::check_err(
            y_host, y_ref, std::string("Value Error: Incorrect results!"), rtol, atol);
        printf("valid:%s", rtn ? "y" : "n");
        fflush(stdout);
    }
#if TEST_ELEMENTWISE_VERBOSE
    printf("\n");
    fflush(stdout);
#endif
    return rtn;
}
// Entry point: parse arguments, then dispatch to the matching test
// instantiation. Currently only op=cast with fp16<->fp32 is wired up.
// Exit code: 0 on success, -1 on parse failure or a failed test.
int main(int argc, char** argv)
{
    auto [result, args] = create_args(argc, argv);
    if(!result)
        return -1;

    const std::string input_prec  = args.get_str("pr_i");
    const std::string output_prec = args.get_str("pr_o");
    const std::string op          = args.get_str("op");

    bool ok = true;
    if(op == "cast")
    {
        if(input_prec == "fp16" && output_prec == "fp32")
        {
            ok &= test_cast<float, ck_tile::fp16_t>(args);
        }
        else if(input_prec == "fp32" && output_prec == "fp16")
        {
            ok &= test_cast<ck_tile::fp16_t, float>(args);
        }
    }
    return ok ? 0 : -1;
}
#include "elementwise_api.hpp"
namespace impl {
// Unary cast functor plugged into the elementwise problem as UnaryFunctor:
// converts the source value to DstType and stores it through the destination
// reference.
struct Cast
{
    template <typename DstType, typename SrcType>
    CK_TILE_HOST_DEVICE void operator()(DstType& y, const SrcType& x) const
    {
        const DstType converted = ck_tile::type_convert<DstType>(x);
        y = converted;
    }
};
} // namespace impl
// Instantiate problem/pipeline/kernel for one (dst, src) cast combination and
// launch it. The macro expands to a series of declarations followed by a
// `return` of the measured time, so it must be the LAST statement of the
// branch it appears in (see elementwise() below).
//   d_type_         : destination element type
//   s_type_         : source element type
//   byte_per_issue_ : bytes per memory issue (VectorSize = this / sizeof(src))
//   chunks_         : tiles processed per block by the pipeline
// NOTE(review): "ElementwiseUnaryipeline" is the real (misspelled) class name
// in the ops header -- do not "fix" it here without renaming the class.
#define DISPATCH_ELEMENTWISE_CAST(d_type_, s_type_, byte_per_issue_, chunks_)                  \
    using src_t = s_type_;                                                                     \
    using dst_t = d_type_;                                                                     \
    using u_fun = typename impl::Cast;                                                         \
    using problem =                                                                            \
        ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_>; \
    using pipeline = ck_tile::ElementwiseUnaryipeline<problem>;                                \
    using kernel   = ck_tile::ElementwiseUnaryKernel<pipeline>;                                \
                                                                                               \
    auto kargs           = kernel::MakeKargs(a);                                               \
    const dim3 grids     = kernel::GridSize(a);                                                \
    constexpr dim3 blocks = kernel::BlockSize();                                               \
                                                                                               \
    float ave_time = ck_tile::launch_kernel(                                                   \
        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));              \
    return ave_time;
// Runtime dispatcher: map the (op, input_type, output_type) trait triple to a
// concrete kernel instantiation and launch it. The DISPATCH macro expands to
// a `return` of the measured time; falling through every branch yields the
// "unsupported" sentinel (-1).
float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
{
    constexpr float not_supported = -1.f;
    if(t.op != "cast")
        return not_supported;

    if(t.input_type == "fp16" && t.output_type == "fp32")
    {
        DISPATCH_ELEMENTWISE_CAST(float, ck_tile::fp16_t, sizeof(ck_tile::fp16_t), 8)
    }
    if(t.input_type == "fp32" && t.output_type == "fp16")
    {
        DISPATCH_ELEMENTWISE_CAST(ck_tile::fp16_t, float, sizeof(float), 8)
    }
    return not_supported;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise_unary.hpp"
#include <string>
// Runtime dispatch descriptor: selects which kernel instantiation
// elementwise() launches.
struct elementwise_trait
{
    std::string input_type;  // input type, e.g. "fp16"/"fp32"
    std::string acc_type;    // type to do intermediate computation
                             // NOTE(review): never set or read by the current
                             // cast path -- confirm before relying on it
    std::string output_type; // type to store out
    std::string op;          // operator name; only "cast" is dispatched today
};
// Host-side launch arguments; inherits p_input/p_output/num_pixels unchanged
// from the kernel's host-args struct.
struct elementwise_kargs : public ck_tile::ElementwiseUnaryHostArgs
{
};
float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_pipeline.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_policy.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_problem.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
// Host-side arguments for the unary elementwise kernel: two flat global
// buffers and the element count. Mirrored 1:1 into the device-side Kargs by
// ElementwiseUnaryKernel::MakeKargs.
struct ElementwiseUnaryHostArgs
{
    const void* p_input;  // device pointer, num_pixels elements of InputType
    void* p_output;       // device pointer, num_pixels elements of OutputType
    uint64_t num_pixels;  // total element count (1-D problem)
};
// Kernel wrapper around Pipeline_: computes launch geometry, marshals host
// args into a POD karg struct, and builds the global-memory tile windows the
// pipeline consumes.
template <typename Pipeline_>
struct ElementwiseUnaryKernel
{
    using Pipeline   = remove_cvref_t<Pipeline_>;
    using Problem    = remove_cvref_t<typename Pipeline::Problem>;
    using InputType  = typename Problem::InputType;
    using OutputType = typename Problem::OutputType;

    // Device-side copy of the host arguments (flat 1-D problem).
    struct ElementwiseUnaryKargs
    {
        const void* p_input;
        void* p_output;
        uint64_t num_pixels;
    };
    using Kargs = ElementwiseUnaryKargs;
    using Hargs = ElementwiseUnaryHostArgs;

    // A block covers BlockSize*VectorSize pixels per pipeline iteration and
    // the pipeline runs Chunks iterations (see the static_for in the
    // pipeline), so the grid is ceil(num_pixels / (BlockSize*Chunks*VectorSize)).
    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
    {
        constexpr index_t issues_per_block =
            Problem::BlockSize * Problem::Chunks * Problem::VectorSize;
        index_t grids =
            static_cast<index_t>((h.num_pixels + issues_per_block - 1) / issues_per_block);
        return dim3(grids);
    }

    // Field-by-field copy of the host args into the kernel-arg POD.
    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
    {
        Kargs k;
        k.p_input    = h.p_input;
        k.p_output   = h.p_output;
        k.num_pixels = h.num_pixels;
        return k;
    }

    CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }

    CK_TILE_DEVICE void operator()(Kargs kargs) const
    {
        // First pixel owned by this block on the first pipeline iteration.
        uint64_t block_base =
            static_cast<uint64_t>(blockIdx.x) * Problem::BlockSize * Problem::VectorSize;
        // Elements remaining from this block's start to the end of the
        // buffer; used as the tensor-view length so the trailing partial tile
        // is bounded. (GridSize guarantees block_base < num_pixels, so this
        // unsigned subtraction cannot underflow.)
        uint64_t pixels_rem = kargs.num_pixels - block_base;
        const auto input_window = [&]() {
            const InputType* p_input =
                reinterpret_cast<const InputType*>(kargs.p_input) + block_base;
            auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                p_input,
                make_tuple(static_cast<index_t>(pixels_rem)),
                number<Problem::VectorSize>{});
            return make_tile_window(
                tmp, make_tuple(number<Problem::BlockSize * Problem::VectorSize>{}), {0});
        }();
        auto output_window = [&]() {
            OutputType* p_output =
                reinterpret_cast<OutputType*>(kargs.p_output) + block_base;
            auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                p_output,
                make_tuple(static_cast<index_t>(pixels_rem)),
                number<Problem::VectorSize>{});
            return make_tile_window(
                tmp, make_tuple(number<Problem::BlockSize * Problem::VectorSize>{}), {0});
        }();
        // Per-iteration advance: the whole grid steps forward together.
        // readfirstlane keeps the (uniform) stride in a scalar register.
        index_t loop_stride =
            __builtin_amdgcn_readfirstlane(gridDim.x * Problem::BlockSize * Problem::VectorSize);
        Pipeline{}(input_window, output_window, loop_stride);
    }
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_policy.hpp"
#include <string>
#include <type_traits>
#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 1
#endif
namespace ck_tile {
// 1-D unary elementwise pipeline: Problem::Chunks times, load a tile, apply
// Problem::UnaryFunctor element-wise, store the result, then slide both
// windows forward by loop_stride elements.
// NOTE(review): the struct name is missing a 'P' ("...Unaryipeline").
// External code (the dispatch macro in the example) instantiates this exact
// spelling, so renaming would be an interface break; documented instead.
template <typename Problem_, typename Policy_ = ElementwiseUnaryPolicy>
struct ElementwiseUnaryipeline
{
    // TODO: this kernel only support warp per row
    using Problem      = remove_cvref_t<Problem_>;
    using Policy       = remove_cvref_t<Policy_>;
    using UnaryFunctor = typename Problem::UnaryFunctor;

    // inp_window/out_window: global-memory tile windows built by the kernel.
    // loop_stride: element distance between consecutive chunk iterations
    // (the whole grid advances together).
    template <typename InputWindow, typename OutputWindow>
    CK_TILE_DEVICE auto
    operator()(const InputWindow& inp_window, OutputWindow& out_window, index_t loop_stride)
    {
        // Rebuild both windows with explicit static tile distributions so
        // load_tile/store_tile know the per-thread element layout.
        auto inp_win = make_tile_window(inp_window.get_bottom_tensor_view(),
                                        inp_window.get_window_lengths(),
                                        inp_window.get_window_origin(),
                                        Policy::template MakeInputDistribution<Problem>());
        auto out_win = make_tile_window(out_window.get_bottom_tensor_view(),
                                        out_window.get_window_lengths(),
                                        out_window.get_window_origin(),
                                        Policy::template MakeOutputDistribution<Problem>());
        static_for<0, Problem::Chunks, 1>{}([&](auto) {
            auto x = load_tile(inp_win);
            // y inherits x's distribution, so the functor application is a
            // straight per-element register transform.
            auto y = make_static_distributed_tensor<typename Problem::OutputType>(
                x.get_tile_distribution());
            tile_elementwise_inout(UnaryFunctor{}, y, x);
            store_tile(out_win, y);
            move_tile_window(inp_win, {loop_stride});
            move_tile_window(out_win, {loop_stride});
        });
    }
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
// Tile-distribution policy for the unary elementwise pipeline: describes how
// the 1-D pixel range is factored into
// IssuesPerRow x WarpsPerBlock x LanesPerRow x VectorSize (values supplied by
// ElementwiseUnaryWarpPerRowProblem).
// NOTE(review): the sequence/tuple arguments follow ck_tile's
// tile_distribution_encoding convention -- consult the ck_tile core docs
// before editing; the claims here are limited to what the Problem constants
// show.
struct ElementwiseUnaryPolicy
{
    // Distribution applied to the input tile window before load_tile.
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto MakeInputDistribution()
    {
        // TODO: Y dim must have one dim that is not reduced
        return make_static_tile_distribution(
            tile_distribution_encoding<sequence<1>,
                                       tuple<sequence<Problem::IssuesPerRow,
                                                      Problem::WarpsPerBlock,
                                                      Problem::LanesPerRow,
                                                      Problem::VectorSize>>,
                                       tuple<sequence<1>, sequence<1>>,
                                       tuple<sequence<1>, sequence<2>>,
                                       sequence<1, 1>,
                                       sequence<0, 3>>{});
    }

    // Distribution applied to the output tile window before store_tile;
    // identical to the input distribution so loads and stores stay
    // element-aligned for the in-register transform.
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto MakeOutputDistribution()
    {
        return make_static_tile_distribution(
            tile_distribution_encoding<sequence<1>,
                                       tuple<sequence<Problem::IssuesPerRow,
                                                      Problem::WarpsPerBlock,
                                                      Problem::LanesPerRow,
                                                      Problem::VectorSize>>,
                                       tuple<sequence<1>, sequence<1>>,
                                       tuple<sequence<1>, sequence<2>>,
                                       sequence<1, 1>,
                                       sequence<0, 3>>{});
    }
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
// Compile-time problem description for the unary elementwise kernel.
//   InputType_/OutputType_ : element types before/after the functor
//   UnaryFunctor_          : functor exposing operator()(Dst&, const Src&)
//   BytesPerIssue_         : bytes moved per memory issue; must be a multiple
//                            of sizeof(InputType) (defaults to one element)
//   Chunks_                : pipeline iterations per block
//   BlockSize_             : threads per workgroup
template <typename InputType_,
          typename OutputType_,
          typename UnaryFunctor_,
          index_t BytesPerIssue_ = sizeof(InputType_), // this is input
          index_t Chunks_        = 8,
          index_t BlockSize_     = 256>
struct ElementwiseUnaryWarpPerRowProblem
{
    // TODO: this kernel only support warp per row
    using InputType    = remove_cvref_t<InputType_>;
    using OutputType   = remove_cvref_t<OutputType_>;
    using UnaryFunctor = remove_cvref_t<UnaryFunctor_>;

    static constexpr index_t Chunks        = Chunks_;
    static constexpr index_t BytesPerIssue = BytesPerIssue_;
    static constexpr index_t BlockSize     = BlockSize_;
    static constexpr index_t WarpSize      = get_warp_size();

    static_assert(BytesPerIssue % sizeof(InputType) == 0);
    // elements per memory issue, derived from the byte width
    static constexpr index_t VectorSize    = BytesPerIssue / sizeof(InputType);
    // flat 1-D problem: every lane of a warp sits on the same "row"
    static constexpr index_t LanesPerRow   = WarpSize;
    static constexpr index_t WarpsPerBlock = BlockSize / LanesPerRow;
    static constexpr index_t IssuesPerRow  = 1;
#if 0
    // NOTE(review): dead code kept from a 2-D (per-expert) variant; it
    // references an `Experts` constant that is not defined anywhere in this
    // header and would not compile if enabled.
    static_assert(Experts % VectorSize == 0);
    static constexpr index_t LanesPerRow = min(Experts / VectorSize, WarpSize);
    static_assert(WarpSize % LanesPerRow == 0);
    static constexpr index_t RowsPerWarpPerColIssue = WarpSize / LanesPerRow;
    static constexpr index_t RowsPerWarp = IssuesPerCol * RowsPerWarpPerColIssue;
    static constexpr index_t IssuesPerRow = Experts / (LanesPerRow * VectorSize);
    static constexpr index_t WarpsPerBlock = BlockSize / WarpSize;
    static constexpr index_t RowsPerBlock = RowsPerWarp * WarpsPerBlock;
#endif
};
} // namespace ck_tile
import pathlib
from pathlib import Path
import subprocess
import os
import copy
NS = 'ck_tile'
OPS = 'ops'
OPS_COMMON = 'common' # common header will be duplicated into ops/* other module
HEADER_COMMON = """// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
"""
# aa/bb/cc/file.hpp -> pick one path component: level 0 -> 'aa', level 1 -> 'bb', ...
def get_module(f, level = 0):
    """Return the path component of *f* at index *level*, as a string."""
    component = f.parts[level]
    return str(component)
# Recursively collect every .hpp header under the current working directory.
# sorted() makes traversal order -- and therefore the generated include order
# -- deterministic across runs and platforms.
all_files = []
for p in sorted(Path("./").rglob("*")):
    if p.suffix == '.hpp':
        all_files.append(pathlib.PurePath(p))
class submodule_t:
    """Groups header paths by top-level module and generates one umbrella
    header per module (or per ops/* sub-module), each containing #include
    lines for every collected header."""

    def __init__(self):
        # m: module name -> list of header paths, EXCEPT for OPS which maps
        # to a nested dict: {'ops': {sub_module: [header paths...]}}.
        self.m = dict()

    def push(self, f):
        # Register one header path in the module tree. Headers directly under
        # the repo root have a single parent ('.') and are skipped.
        if len(f.parents) != 1: # ignore ./xxx.hpp
            mod = get_module(f)
            if mod == OPS:
                if mod not in self.m.keys():
                    self.m[mod] = dict()
                mod2 = get_module(f, 1)
                # only ops/<subdir>/... entries count; a bare ops/xxx.hpp has
                # a '.hpp' suffix at level 1 and is skipped
                if Path(mod2).suffix != '.hpp':
                    # ignore ops/xxx.hpp
                    if mod2 not in self.m[mod].keys():
                        self.m[mod][mod2] = list()
                    self.m[mod][mod2].append(f)
            else:
                if mod not in self.m.keys():
                    self.m[mod] = list()
                self.m[mod].append(f)

    def gen(self):
        def gen_header(hpath, include_list):
            # Rewrite hpath from scratch: license banner, pragma once, then
            # one #include per collected header (prefixed with the NS dir).
            # print(hpath)
            if os.path.exists(str(hpath)):
                os.remove(str(hpath))
            with hpath.open('w') as f:
                f.write(HEADER_COMMON)
                f.write('#pragma once\n')
                f.write('\n')
                for individual_header in include_list:
                    header_path = NS + '/' + str(individual_header)
                    f.write(f'#include \"{header_path}\"\n')
                # f.write('\n') # otherwise clang-format will complain
        # print(self.m)
        # restructure common: duplicate the ops/common headers into every
        # other ops/* sub-module so each umbrella header is self-contained
        for k, v in self.m.items():
            if k == OPS and OPS_COMMON in v.keys():
                common_list = copy.deepcopy(v[OPS_COMMON])
                # v.pop(OPS_COMMON)
                for km in v.keys():
                    if km != OPS_COMMON:
                        v[km].extend(common_list)
        # emit ops/<submodule>.hpp for each ops sub-module, <module>.hpp for
        # everything else
        for k, v in self.m.items():
            if k == OPS:
                for km, kv in v.items():
                    gen_header(Path(k) / (f'{km}.hpp'), kv)
            else:
                gen_header(Path(f'{k}.hpp'), v)
submodule = submodule_t()
# formatting: normalize line endings, then clang-format, then register the
# file with the submodule tree.
# BUGFIX: this previously used subprocess.Popen without waiting, so
# clang-format could race dos2unix on the same file and the script could exit
# before any formatter finished; subprocess.run blocks until each tool
# completes.
for x in all_files:
    subprocess.run(f'dos2unix {str(x)}', shell=True)
    subprocess.run(f'clang-format-12 -style=file -i {str(x)}', shell=True)
    submodule.push(x)
submodule.gen()
#print(all_files)
...@@ -6,3 +6,4 @@ add_subdirectory(01_fmha)
 add_subdirectory(02_layernorm2d)
 add_subdirectory(03_gemm)
 add_subdirectory(06_permute)
+add_subdirectory(19_elementwise)
...@@ -1079,6 +1079,15 @@ struct ConvScaleRelu
 float scale_out_;
 };
template<typename DstType, typename SrcType>
struct Cast {
template <typename T>
CK_TILE_HOST_DEVICE void operator()(DstType& y, const SrcType& x) const
{
y = ck_tile::type_convert<DstType>(x);
};
};
// support fastconvert of int8 to fp16
#if 0
template <typename InputDataType, typename OutputDataType, index_t RegPackNumber>
......
...@@ -9,7 +9,7 @@
 #include <type_traits>
 #ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
-#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 0
+#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 1
 #endif
 namespace ck_tile {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment