Commit 7e297b13 authored by Paul

Merge

parents 86ea5e91 aa7ff911
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
static const char* const pointwise_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp>
#include <args.hpp>
using namespace migraphx;
extern "C" {
__global__ void kernel(${params})
{
pointwise(${lambda}, ${args});
}
}
int main() {}
)__migraphx__";
// Builds "<param>0,<param>1,...,<param>(count-1)" for the kernel source template
std::string enum_params(std::size_t count, std::string param)
{
std::vector<std::string> items(count);
transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
return join_strings(items, ",");
}
std::size_t compute_global(std::size_t n, std::size_t local = 1024)
{
// round up to enough groups to cover n elements, capped at 256 groups;
// each thread then strides over any remaining elements
std::size_t groups = (n + local - 1) / local;
std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
return nglobal;
}
operation compile_pointwise(context&, const std::vector<shape>& inputs, const std::string& lambda)
{
hip_compile_options options;
options.global = compute_global(inputs.front().elements());
options.local = 1024;
options.inputs = inputs;
options.output = inputs.back();
options.reduced_inputs = reduce_dims(inputs);
auto src = interpolate_string(pointwise_kernel,
{{"params", enum_params(inputs.size(), "void * private_p")},
{"args", enum_params(inputs.size(), "private_p")},
{"lambda", lambda}});
return compile_hip_code_object(src, options);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
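For illustration, interpolate_string substitutes the ${params}, ${args}, and ${lambda} placeholders in the template above; with two inputs and a hypothetical add lambda the generated source would look roughly like this (a sketch, not captured compiler output):

// Hypothetical interpolation result for inputs.size() == 2 and
// lambda "[](auto x, auto y) { return x + y; }":
extern "C" {
__global__ void kernel(void * private_p0,void * private_p1)
{
    pointwise([](auto x, auto y) { return x + y; }, private_p0,private_p1);
}
}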
#include <migraphx/gpu/compiler.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
auto& compiler_map()
{
static std::unordered_map<std::string, compiler_compile> m; // NOLINT
return m;
}
auto& compiler_op_map()
{
static std::unordered_map<std::string, compiler_compile_op> m; // NOLINT
return m;
}
void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop)
{
compiler_map()[name] = std::move(c);
compiler_op_map()[name] = std::move(cop);
}
bool has_compiler_for(const std::string& name) { return compiler_map().count(name) > 0; }
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op)
{
return compiler_map().at(op.name())(ctx, ins, op);
}
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v)
{
return compiler_op_map().at(name)(ctx, inputs, v);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/fill.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void fill(hipStream_t stream, const argument& result, unsigned long val)
{
nary(stream, result)([=]() __device__ { return val; });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#include <migraphx/requires.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class... Ts>
using common_type = typename std::common_type<Ts...>::type;
template <class T, MIGRAPHX_REQUIRES(is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
return std::isfinite(x) and std::isfinite(y) and
std::nextafter(x, std::numeric_limits<T>::lowest()) <= y and
std::nextafter(x, std::numeric_limits<T>::max()) >= y;
}
template <class T, MIGRAPHX_REQUIRES(not is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
return x == y;
}
template <class T, class U>
__device__ bool float_equal(T x, U y)
{
return float_equal_device<common_type<T, U>>(x, y);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
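The bracketing test above treats two floats as equal when each lies within one ULP of the other. A host-side analogue for illustration (not part of this commit):

#include <cassert>
#include <cmath>
#include <limits>

// Host analogue of float_equal_device: y must fall inside the one-ULP
// bracket around x, and both values must be finite.
template <class T>
bool float_equal_host(T x, T y)
{
    return std::isfinite(x) and std::isfinite(y) and
           std::nextafter(x, std::numeric_limits<T>::lowest()) <= y and
           std::nextafter(x, std::numeric_limits<T>::max()) >= y;
}

int main()
{
    assert(0.1 + 0.2 != 0.3);                 // plain == misses by one ULP
    assert(float_equal_host(0.1 + 0.2, 0.3)); // bracketed compare succeeds
}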
@@ -75,8 +75,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index) -> decltype(
inline auto gs_launch(hipStream_t stream, index_int n, index_int local = 1024)
{
-index_int groups = (n + local - 1) / local;
-index_int nglobal = std::min<index_int>(256, groups) * local;
+index_int groups = (n + local - 1) / local;
+// max possible number of blocks is set to 1B (1,073,741,824)
+index_int nglobal = std::min<index_int>(1073741824, groups) * local;
return [=](auto f) {
launch(stream, nglobal, local)([=](auto idx) __device__ {
......
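The change above raises the cap on launched groups from 256 to 2^30. A worked example of the difference (illustrative numbers):

#include <algorithm>
#include <cstdio>

int main()
{
    using index_int = unsigned long;
    index_int n = 10'000'000, local = 1024;
    index_int groups = (n + local - 1) / local;                          // 9766
    index_int old_ng = std::min<index_int>(256, groups) * local;        // 262144
    index_int new_ng = std::min<index_int>(1073741824, groups) * local; // 10000384
    // with the old cap each thread strided over ~38 elements; now the
    // grid covers n in a single pass
    std::printf("%lu %lu\n", old_ng, new_ng);
}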
@@ -57,9 +57,10 @@ inline auto mi_nglobal(const hip_shape<N>& s, index_int nlocal)
{
assert(s.standard);
assert(s.elements() > 0);
-index_int n = s.elements();
-index_int groups = (n + nlocal - 1) / nlocal;
-index_int nglobal = std::min<index_int>(128, groups) * nlocal;
+index_int n = s.elements();
+index_int groups = (n + nlocal - 1) / nlocal;
+// max possible number of blocks is set to 1B (1,073,741,824)
+index_int nglobal = std::min<index_int>(1073741824, groups) * nlocal;
assert(groups > 0);
assert(nglobal > 0);
......
@@ -12,10 +12,6 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
-#if __AMDGCN_WAVEFRONT_SIZE == 32
-#define MIGRAPHX_NO_DPP
-#endif
#ifdef MIGRAPHX_NO_DPP
template <index_int N,
class Op,
@@ -98,10 +94,12 @@ __device__ void dpp_reduce(T& in, Op op)
in = op(in, out);
out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
in = op(in, out);
+#if __AMDGCN_WAVEFRONT_SIZE == 64
out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
in = op(in, out);
out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
in = op(in, out);
+#endif
}
__device__ inline void dpp_reduce(float& x, sum)
@@ -118,9 +116,11 @@ __device__ inline void dpp_reduce(float& x, sum)
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
"s_nop 1\n"
+#if __AMDGCN_WAVEFRONT_SIZE == 64
"v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
"s_nop 1\n"
"v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
+#endif
"s_nop 1\n"
: "=v"(x)
: "0"(x));
@@ -135,21 +135,27 @@ template <index_int N,
MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
__device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
{
-using type = decltype(f(deduce_for_stride(fs)));
-MIGRAPHX_DEVICE_SHARED type buffer[N / 64];
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+constexpr index_int nthreads = 16;
+#else
+constexpr index_int nthreads = 64;
+#endif
+using type = decltype(f(deduce_for_stride(fs)));
+MIGRAPHX_DEVICE_SHARED type buffer[N / nthreads];
type x = init;
fs([&](auto i) { x = op(x, f(i)); });
dpp_reduce(x, op);
-const auto ldsidx = idx.local / 64;
-if((idx.local % 64) == 63)
+const auto ldsidx = idx.local / nthreads;
+if((idx.local % nthreads) == nthreads - 1)
{
buffer[ldsidx] = x;
}
__syncthreads();
type y = init;
-for(index_int i = 0; i < idx.nlocal() / 64; i++)
+for(index_int i = 0; i < idx.nlocal() / nthreads; i++)
{
y = op(y, buffer[i]);
}
......
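After this change the reduction stashes one partial per group of nthreads lanes (16 on wave32, 64 on wave64) instead of hard-coding 64. A host analogue of the two-level scheme (illustrative):

#include <cassert>
#include <numeric>
#include <vector>

int main()
{
    const std::size_t nthreads = 64, nlocal = 256;
    std::vector<int> x(nlocal, 1);
    std::vector<int> buffer(nlocal / nthreads); // one slot per lane group
    for(std::size_t w = 0; w < buffer.size(); w++)
        buffer[w] = std::accumulate(x.begin() + w * nthreads,
                                    x.begin() + (w + 1) * nthreads, 0);
    // every lane then combines the per-group partials
    int y = std::accumulate(buffer.begin(), buffer.end(), 0);
    assert(y == 256);
}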
@@ -44,12 +44,19 @@ __device__ void block_scan(index idx, Op op, T init, ForStride fs, Input input,
template <index_int N, class Op, class T, class Input, class Output>
__device__ void block_scan(index idx, Op op, T init, index_int n, Input input, Output output)
{
-block_scan<N>(idx,
-op,
-init,
-[&](auto f) -> decltype(f(index_int{})) { return idx.local_stride(n, f); },
-input,
-output);
+block_scan<N>(
+idx,
+op,
+init,
+[&](auto f) -> decltype(f(index_int{})) { return idx.local_stride(n, f); },
+input,
+output);
}
+template <class F>
+constexpr auto reverse_scan(index_int n, F f)
+{
+return [=](auto i, auto&&... xs) { return f(n - i - 1, xs...); };
+}
} // namespace device
......
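The new reverse_scan adapter flips the index space so a forward scan becomes a suffix scan. A host sketch of the same idea (illustrative, not the device code):

#include <cstdio>
#include <vector>

// Flip the index space: element i of the wrapped function reads element
// n - i - 1 of the underlying sequence.
template <class F>
auto reverse_scan(int n, F f)
{
    return [=](auto i, auto&&... xs) { return f(n - i - 1, xs...); };
}

int main()
{
    std::vector<int> in{1, 2, 3, 4};
    std::vector<int> out(4);
    auto read  = reverse_scan(4, [&](int j) { return in[j]; });
    auto write = reverse_scan(4, [&](int j, int x) { out[j] = x; });
    int running = 0;
    for(int i = 0; i < 4; i++) // forward inclusive scan over the flipped view
    {
        running += read(i);
        write(i, running);
    }
    // out now holds the suffix sums of in: {10, 9, 7, 4}
    for(int v : out)
        std::printf("%d ", v);
}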
@@ -129,6 +129,21 @@ __device__ __host__ T to_hip_type(T x)
// Hip doesn't support __fp16
inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }
+#define MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(trait, T) \
+template <class X> \
+struct trait : std::trait<X> \
+{ \
+}; \
+\
+template <> \
+struct trait<T> : std::true_type \
+{ \
+};
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, __fp16)
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, __fp16)
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, __fp16)
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
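For reference, expanding the macro for is_floating_point yields a device-namespace trait that forwards to the std trait for every type except __fp16, which is forced to true:

// Expansion of MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, __fp16):
template <class X>
struct is_floating_point : std::is_floating_point<X>
{
};

template <>
struct is_floating_point<__fp16> : std::true_type
{
};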
@@ -14,28 +14,23 @@ constexpr void visit_tensor_size(index_int n, F f)
{
switch(n)
{
-case 1:
-{
+case 1: {
f(std::integral_constant<index_int, 1>{});
break;
}
-case 2:
-{
+case 2: {
f(std::integral_constant<index_int, 2>{});
break;
}
-case 3:
-{
+case 3: {
f(std::integral_constant<index_int, 3>{});
break;
}
-case 4:
-{
+case 4: {
f(std::integral_constant<index_int, 4>{});
break;
}
-case 5:
-{
+case 5: {
f(std::integral_constant<index_int, 5>{});
break;
}
@@ -181,7 +176,13 @@ template <index_int N, class T, class... Ts>
auto hip_vec_visit_all(T&& x, Ts&&... xs)
{
return [&](auto f) {
-hip_visit_all_impl(get_shape(x),
+auto sx = get_shape(x);
+auto lens = sx.lens();
+assert(lens.back() % N == 0);
+assert(sx.strides().back() == 1);
+lens.back() /= N;
+shape vec_sx{sx.type(), lens};
+hip_visit_all_impl(vec_sx,
make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }),
f,
x,
......
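The new checks encode the vectorization contract: the innermost axis must be contiguous and divisible by N, and its length shrinks by N. A small worked example (hypothetical dims):

#include <cassert>
#include <vector>

int main()
{
    const std::size_t n = 4;             // vector width N
    std::vector<std::size_t> lens{2, 8}; // hypothetical dims, innermost stride 1
    assert(lens.back() % n == 0);        // innermost axis must divide evenly
    lens.back() /= n;                    // {2, 2}: the kernel sees 4 vector elements
    assert(lens.back() == 2);            // ...instead of 16 scalars
}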
@@ -8,6 +8,14 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
+#ifndef MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 1
+#else
+#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 0
+#endif
+#endif
template <class T>
struct vector_type
{
@@ -86,10 +94,13 @@ __device__ void layernorm(index_int i,
const bool in_range = idx.local < relements_v;
auto mean = [&](auto z) {
-return auto_block_reduce<MaxBlockSize>(
-idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
-value_type(relements);
+auto m = auto_block_reduce<MaxBlockSize>(
+idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
+value_type(relements);
+#if MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
+__builtin_amdgcn_s_barrier();
+#endif
+return m;
};
// m = x - mean(x)
......
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class Iterator, class T>
constexpr Iterator upper_bound(Iterator first, Iterator last, const T& value)
{
Iterator it;
typename std::iterator_traits<Iterator>::difference_type count;
typename std::iterator_traits<Iterator>::difference_type step;
count = std::distance(first, last);
while(count > 0)
{
it = first;
step = count / 2;
std::advance(it, step);
if(!(value < *it))
{
first = ++it;
count -= step + 1;
}
else
count = step;
}
return first;
}
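// Note: this upper_bound mirrors std::upper_bound so the multinomial
// kernel below can binary-search a CDF row in device code. Host
// illustration with hypothetical weights {1, 2, 3, 4} (CDF {1, 3, 6, 10}):
// a uniform draw u = 0.55 scaled by the row total (10) gives 5.5, and
// upper_bound returns the first entry greater than 5.5:
//
//     float cdf[] = {1, 3, 6, 10};
//     auto* it = upper_bound(std::begin(cdf), std::end(cdf), 0.55f * 10);
//     // std::distance(std::begin(cdf), it) == 2 -> sampled class index 2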
void multinomial(hipStream_t stream,
const argument& result,
const argument& arg0,
const argument& arg1)
{
size_t batch_size = arg0.get_shape().lens().front();
size_t class_size = arg0.get_shape().lens().back();
size_t sample_size = result.get_shape().lens().back();
hip_visit_all(arg0, arg1)([&](auto cdf, auto dist) {
result.visit([&](auto out) {
hip_visit_views(out)([&](auto output) {
gs_launch(stream, batch_size * sample_size)([=](auto i) __device__ {
auto idx = output.get_shape().multi(i);
auto cdf_begin = cdf.begin() + (idx.front() * class_size);
auto cdf_end = cdf_begin + class_size;
auto sample_iter =
upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
output[i] = std::distance(cdf_begin, sample_iter);
});
});
});
});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/nonzero.hpp>
#include <migraphx/gpu/device/float_equal.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data)
{
auto s = arg_data.get_shape();
auto elem_num = s.elements();
auto out_elem_num = result.get_shape().elements();
// run a block-wise prefix sum over the nonzero indicator to compute each
// element's index in the output; only one block can be used since there
// is a single prefix sum across the whole input
const index_int block_size = 256;
hip_visit_all(arg_data, s)([&](auto input, auto si) {
const auto* in_ptr = device_cast(input.data());
auto* ptr = result.cast<int64_t>();
gs_launch(stream, block_size, block_size)([=](auto, auto idx) __device__ {
// fill all output to 0 first
idx.local_stride(out_elem_num, [&](auto j) { ptr[j] = 0; });
block_scan<block_size>(
idx,
sum{},
0,
elem_num,
[&](auto j) { return (float_equal(in_ptr[j], 0)) ? 0 : 1; },
[&](auto j, auto x) {
auto out_loc = x - 1;
if(float_equal(in_ptr[j], 0))
return;
auto index = si.multi(j);
for(size_t k = 0; k < index.size(); ++k)
{
ptr[k * elem_num + out_loc] = index[k];
}
});
});
});
return result;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
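The scan-based layout is easiest to see on a small host example (illustrative; the device version scatters one row of multi-index coordinates per input dimension):

#include <cassert>
#include <vector>

int main()
{
    std::vector<int> in{0, 3, 0, 5};
    std::vector<long> out(in.size(), 0);
    int running = 0; // inclusive scan of the nonzero indicator {0,1,0,1}
    for(std::size_t j = 0; j < in.size(); j++)
    {
        running += (in[j] != 0) ? 1 : 0;
        if(in[j] != 0)
            out[running - 1] = j; // out_loc = scan - 1
    }
    assert(out[0] == 1 and out[1] == 3); // nonzeros at input positions 1 and 3
}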
#include <migraphx/gpu/device/prefix_scan_sum.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
#include <migraphx/gpu/device/reduce.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
@@ -8,29 +9,108 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
-void prefix_scan_sum(hipStream_t stream, const argument& result, const argument& arg, int32_t axis)
+void prefix_scan_sum(hipStream_t stream,
+const argument& result,
+const argument& arg,
+int32_t axis,
+bool exclusive,
+bool reverse)
{
-const index_int block_size = 256;
-const index_int n = arg.get_shape().lens()[axis];
-auto rlens = result.get_shape().lens();
-rlens[axis] = 1;
+const index_int max_block_size = 256;
+const index_int n = arg.get_shape().lens()[axis];
+auto rlens = result.get_shape().lens();
+rlens[axis] = 1;
hip_visit_all(result, arg, result.get_shape().with_lens(rlens))(
[=](auto output, auto input, auto rshape) {
-gs_launch(stream, rshape.elements() * block_size, block_size)(
-[=](auto i, auto idx) __device__ {
-const auto ridx = rshape.multi(i / block_size);
-auto compute_idx = [&](auto j) {
-auto k = ridx;
-k[axis] = j;
-return k;
-};
-block_scan<block_size>(idx,
-sum{},
-0,
-n,
-[&](auto j) { return input[compute_idx(j)]; },
-[&](auto j, auto x) { output[compute_idx(j)] = x; });
-});
+const index_int block_size = compute_block_size(rshape.elements(), max_block_size);
+if(reverse and exclusive)
+{
+gs_launch(stream, rshape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+const auto ridx = rshape.multi(i / block_size);
+auto compute_idx = [&](auto j) {
+auto k = ridx;
+k[axis] = j;
+return k;
+};
+block_scan<max_block_size>(
+idx,
+sum{},
+0,
+n,
+reverse_scan(n, [&](auto j) { return input[compute_idx(j)]; }),
+reverse_scan(n, [&](auto j, auto x) {
+if(j == n - 1)
+output[compute_idx(j)] = 0;
+if(j > 0)
+output[compute_idx(j - 1)] = x;
+}));
+});
+}
+else if(reverse)
+{
+gs_launch(stream, rshape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+const auto ridx = rshape.multi(i / block_size);
+auto compute_idx = [&](auto j) {
+auto k = ridx;
+k[axis] = j;
+return k;
+};
+block_scan<max_block_size>(
+idx,
+sum{},
+0,
+n,
+reverse_scan(n, [&](auto j) { return input[compute_idx(j)]; }),
+reverse_scan(n, [&](auto j, auto x) { output[compute_idx(j)] = x; }));
+});
+}
+else if(exclusive)
+{
+gs_launch(stream, rshape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+const auto ridx = rshape.multi(i / block_size);
+auto compute_idx = [&](auto j) {
+auto k = ridx;
+k[axis] = j;
+return k;
+};
+block_scan<max_block_size>(
+idx,
+sum{},
+0,
+n,
+[&](auto j) { return input[compute_idx(j)]; },
+[&](auto j, auto x) {
+auto k = j + 1;
+if(j == 0)
+output[compute_idx(0)] = 0;
+if(k < n)
+output[compute_idx(k)] = x;
+});
+});
+}
+else
+{
+gs_launch(stream, rshape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+const auto ridx = rshape.multi(i / block_size);
+auto compute_idx = [&](auto j) {
+auto k = ridx;
+k[axis] = j;
+return k;
+};
+block_scan<max_block_size>(
+idx,
+sum{},
+0,
+n,
+[&](auto j) { return input[compute_idx(j)]; },
+[&](auto j, auto x) { output[compute_idx(j)] = x; });
+});
+}
});
}
......
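The four branches above cover the cumsum variants; a host reference of what each computes (hypothetical input {1, 2, 3, 4}):

#include <algorithm>
#include <cassert>
#include <vector>

// Host reference for the four scan variants (illustrative).
std::vector<int> scan_ref(std::vector<int> v, bool exclusive, bool reverse)
{
    if(reverse)
        std::reverse(v.begin(), v.end());
    std::vector<int> out(v.size());
    int running = 0;
    for(std::size_t i = 0; i < v.size(); i++)
    {
        if(exclusive)
        {
            out[i] = running; // shift by one: element i excluded
            running += v[i];
        }
        else
        {
            running += v[i]; // inclusive: element i included
            out[i] = running;
        }
    }
    if(reverse)
        std::reverse(out.begin(), out.end());
    return out;
}

int main()
{
    std::vector<int> in{1, 2, 3, 4};
    assert((scan_ref(in, false, false) == std::vector<int>{1, 3, 6, 10}));
    assert((scan_ref(in, true, false) == std::vector<int>{0, 1, 3, 6}));
    assert((scan_ref(in, false, true) == std::vector<int>{10, 9, 7, 4}));
    assert((scan_ref(in, true, true) == std::vector<int>{9, 7, 4, 0}));
}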
@@ -20,34 +20,58 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
-const index_int max_block_size = 256;
+const index_int max_block_size = 128;
const index_int block_size = compute_block_size(batch_item_num, max_block_size);
-gs_launch(stream,
-batch_shape.elements() * block_size,
-block_size)([=](auto i, auto idx) __device__ {
-auto data_idx = batch.multi(i / block_size);
-using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
-type init = lowest();
-auto batch_max = block_reduce<max_block_size>(
-idx, max{}, init, batch_item_num, [&](auto j) __device__ {
-data_idx[axis] = j;
-return input[data_idx];
-});
+using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+type init = lowest();
+if(axis == batch_lens.size() - 1)
+{
+gs_launch(stream, batch_shape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+auto start_loc = i / block_size * batch_item_num;
+auto batch_max = block_reduce<max_block_size>(
+idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+return input[start_loc + j];
+});
+auto batch_sum = block_reduce<max_block_size>(
+idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+auto val = input[start_loc + j] - batch_max;
+return ::exp(to_hip_type(val));
+});
-auto batch_sum =
-block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
-data_idx[axis] = j;
-auto val = input[data_idx] - batch_max;
-return ::exp(to_hip_type(val));
+idx.local_stride(batch_item_num, [&](auto j) __device__ {
+auto val = input[start_loc + j] - batch_max;
+output[start_loc + j] = ::exp(to_hip_type(val)) / batch_sum;
+});
+});
+}
+else
+{
+gs_launch(stream, batch_shape.elements() * block_size, block_size)(
+[=](auto i, auto idx) __device__ {
+auto data_idx = batch.multi(i / block_size);
+auto batch_max = block_reduce<max_block_size>(
+idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+data_idx[axis] = j;
+return input[data_idx];
+});
-idx.local_stride(batch_item_num, [&](auto j) __device__ {
-data_idx[axis] = j;
-auto val = input[data_idx] - batch_max;
-output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
-});
-});
+auto batch_sum = block_reduce<max_block_size>(
+idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+data_idx[axis] = j;
+auto val = input[data_idx] - batch_max;
+return ::exp(to_hip_type(val));
+});
+idx.local_stride(batch_item_num, [&](auto j) __device__ {
+data_idx[axis] = j;
+auto val = input[data_idx] - batch_max;
+output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
+});
+});
+}
});
}
......
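Both branches compute the usual max-subtracted softmax; the fast path only changes the addressing for the contiguous innermost axis. A scalar host reference (illustrative):

#include <algorithm>
#include <cmath>
#include <vector>

// Host reference: numerically stable softmax over one batch row.
std::vector<float> softmax_row(const std::vector<float>& x)
{
    float m = *std::max_element(x.begin(), x.end()); // batch_max
    float s = 0;
    for(float v : x)
        s += std::exp(v - m); // batch_sum
    std::vector<float> out;
    out.reserve(x.size());
    for(float v : x)
        out.push_back(std::exp(v - m) / s);
    return out;
}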
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/topk.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/visit.hpp>
#include <migraphx/ranges.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class T, class Index, class Compare>
struct hip_heap_vector
{
MIGRAPHX_DEVICE_CONSTEXPR hip_heap_vector(T* val, index_int n, Index v_idx, Compare comp)
: data(val), size(n), data_index(v_idx), compare(comp)
{
make_heap(size);
}
MIGRAPHX_DEVICE_CONSTEXPR void try_push(const T val)
{
// the heap root holds the weakest of the current k candidates; skip
// values that do not beat it, otherwise evict the root and re-heapify
if(compare(val, data[data_index(0)]))
return;
pop_heap(size - 1);
data[data_index(size - 1)] = val;
push_heap(size - 1);
}
MIGRAPHX_DEVICE_CONSTEXPR void sort() { sort_heap(size); }
private:
MIGRAPHX_DEVICE_CONSTEXPR inline static void swap(T& v1, T& v2)
{
T v = v1;
v1 = v2;
v2 = v;
}
MIGRAPHX_DEVICE_CONSTEXPR inline void heapify_down(index_int n, index_int index)
{
while(index < n)
{
auto pre_index = index;
index_int l = 2 * index + 1;
index_int r = 2 * index + 2;
if(l < n && compare(data[data_index(l)], data[data_index(index)]))
{
index = l;
}
if(r < n && compare(data[data_index(r)], data[data_index(index)]))
{
index = r;
if(compare(data[data_index(l)], data[data_index(r)]))
{
index = l;
}
}
if(index == pre_index)
{
break;
}
swap(data[data_index(index)], data[data_index(pre_index)]);
}
}
MIGRAPHX_DEVICE_CONSTEXPR inline void heapify_up(index_int index)
{
while(index > 0)
{
auto parent_idx = (index - 1) / 2;
if(not compare(data[data_index(index)], data[data_index(parent_idx)]))
{
break;
}
swap(data[data_index(index)], data[data_index(parent_idx)]);
index = parent_idx;
}
}
MIGRAPHX_DEVICE_CONSTEXPR inline void make_heap(index_int n)
{
for(int j = n / 2 - 1; j >= 0; --j)
{
heapify_down(n, j);
}
}
MIGRAPHX_DEVICE_CONSTEXPR inline void push_heap(index_int loc) { heapify_up(loc); }
MIGRAPHX_DEVICE_CONSTEXPR inline void pop_heap(index_int loc)
{
swap(data[data_index(0)], data[data_index(loc)]);
heapify_down(loc, 0);
}
MIGRAPHX_DEVICE_CONSTEXPR inline void sort_heap(index_int n)
{
for(int j = n - 1; j > 0; --j)
{
swap(data[data_index(0)], data[data_index(j)]);
heapify_down(j, 0);
}
}
T* data = nullptr;
index_int size;
Index data_index;
Compare compare;
};
template <class T, class Index, class Compare>
__device__ hip_heap_vector<T, Index, Compare>
make_heap(T* data, index_int n, Index idx, Compare compare)
{
return {data, n, idx, compare};
}
template <class Compare>
std::vector<argument> topk(hipStream_t stream,
const argument& val_res,
const argument& ind_res,
const argument& arg,
int64_t k,
int64_t axis,
Compare compare)
{
auto in_s = arg.get_shape();
auto in_lens = in_s.lens();
auto out_s = val_res.get_shape();
auto axis_dim = in_s.lens()[axis];
auto comp_lens = in_lens;
comp_lens[axis] = 1;
shape comp_s{in_s.type(), comp_lens};
std::size_t elem_num = comp_s.elements();
hip_visit_all(val_res, arg, out_s, in_s, comp_s)(
[&](auto out_val, auto input, auto oss, auto iss, auto css) {
auto* data = device_cast(input.data());
auto* out = device_cast(out_val.data());
auto* const ind = ind_res.cast<int64_t>();
gs_launch(stream, elem_num)([=](auto i) __device__ {
auto idx = css.multi(i);
auto in_idx = [&](int ii) {
auto iidx = idx;
iidx[axis] = ii;
return iss.index(iidx);
};
auto out_idx = [&](int ii) {
auto iidx = idx;
iidx[axis] = ii;
return oss.index(iidx);
};
auto data_compare = [=](auto ii, auto jj) {
return compare(data[in_idx(ii)], data[in_idx(jj)]);
};
for(int j = 0; j < k; ++j)
{
ind[out_idx(j)] = j;
}
auto hp = make_heap(ind, k, out_idx, data_compare);
for(int j = k; j < axis_dim; ++j)
{
hp.try_push(j);
}
hp.sort();
for(int j = 0; j < k; ++j)
{
out[out_idx(j)] = data[in_idx(ind[out_idx(j)])];
}
});
});
return {val_res, ind_res};
}
argument topk_largest(hipStream_t stream,
const argument& val_res,
const argument& ind_res,
const argument& arg,
int64_t k,
int64_t axis)
{
return {topk(stream, val_res, ind_res, arg, k, axis, std::less<>{})};
}
argument topk_smallest(hipStream_t stream,
const argument& val_res,
const argument& ind_res,
const argument& arg,
int64_t k,
int64_t axis)
{
return {topk(stream, val_res, ind_res, arg, k, axis, std::greater<>{})};
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
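The selection strategy is the standard bounded heap: seed with the first k candidates, then let each later element evict the weakest root if it compares better. A host sketch with std::priority_queue (illustrative; the device code keeps indices in its own in-place heap):

#include <cassert>
#include <functional>
#include <queue>
#include <vector>

// Top-k largest via a min-heap of size k: the root is the smallest of
// the current candidates, so any larger value evicts it.
std::vector<int> topk_largest_host(const std::vector<int>& v, std::size_t k)
{
    std::priority_queue<int, std::vector<int>, std::greater<>> heap;
    for(int x : v)
    {
        if(heap.size() < k)
            heap.push(x);
        else if(x > heap.top())
        {
            heap.pop();
            heap.push(x);
        }
    }
    std::vector<int> out;
    while(!heap.empty()) // pops in ascending order
    {
        out.push_back(heap.top());
        heap.pop();
    }
    return {out.rbegin(), out.rend()}; // descending, like topk_largest
}

int main()
{
    assert((topk_largest_host({5, 1, 9, 3, 7}, 3) == std::vector<int>{9, 7, 5}));
}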
#include <migraphx/gpu/device/where.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/launch.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class Shape>
constexpr auto get_rank(const Shape&)
{
return decltype(typename Shape::hip_index{}.size()){};
}
void where(hipStream_t stream,
const argument& result,
const argument& arg0,
const argument& arg1,
const argument& arg2)
{
hip_visit_all(result, arg1, arg2)([&](auto output, auto x, auto y) {
hip_visit_all(arg0)([&](auto cond) {
if constexpr(get_rank(cond.get_shape()) == get_rank(output.get_shape()))
{
gs_launch(stream, arg1.get_shape().elements())([=](auto idx) __device__ {
auto i = output.get_shape().multi(idx);
output[i] = cond[i] ? x[i] : y[i];
});
}
});
});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
+file(GLOB GPU_DRIVER_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
add_executable(gpu-driver
-action.cpp
-compile_pointwise.cpp
-main.cpp
-parser.cpp
-perf.cpp
-run_op.cpp
+${GPU_DRIVER_SRCS}
)
target_include_directories(gpu-driver PRIVATE include)
target_link_libraries(gpu-driver PRIVATE migraphx_gpu)
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
@@ -8,13 +8,13 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
-struct compile_pointwise : action<compile_pointwise>
+struct compile_op : action<compile_op>
{
static void apply(const parser& p, const value& v)
{
context ctx;
auto inputs = p.parse_shapes(v.at("inputs"));
-auto op = gpu::compile_pointwise(ctx, inputs, v.at("lambda").to<std::string>());
+auto op = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << t << "ms" << std::endl;
}
......
@@ -2,6 +2,7 @@
#include <migraphx/json.hpp>
#include <migraphx/convert_to_json.hpp>
#include <migraphx/file_buffer.hpp>
+#include <iostream>
using namespace migraphx; // NOLINT
using namespace migraphx::gpu; // NOLINT
......