Commit 3d08f40f authored by Shucai Xiao
Browse files

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into scatter-op

parents 50cfbcda 6ba279cc
#include <migraphx/gpu/preallocate_param.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/program.hpp>
#include <migraphx/preallocate_param.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/module.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/stringutils.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
void preallocate_param::apply(module& p) const
void preallocate_param::apply(module& m) const
{
for(auto ins : iterator_for(p))
for(auto ins : iterator_for(m))
{
if(ins->name() != "@param")
continue;
if(param != any_cast<builtin::param>(ins->get_operator()).parameter)
continue;
std::string id = p.name() + ":" + param;
auto r = p.insert_instruction(ins, hip_allocate_memory{ins->get_shape(), id});
p.replace_instruction(ins, r);
std::string id = m.name() + ":" + param;
auto r = m.insert_instruction(ins, model.preallocate(ins->get_shape(), id));
m.replace_instruction(ins, r);
}
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -19,6 +19,7 @@ add_library(migraphx_cpu
logsoftmax.cpp
lowering.cpp
lrn.cpp
preallocate.cpp
pooling.cpp
reduction.cpp
reorder.cpp
......
......@@ -11,6 +11,11 @@ operation cpu_allocation_model::allocate(const shape& s) const
return make_op(name(), {{"shape", to_value(s)}});
}
// Construct the cpu target's preallocation operation for a buffer of the
// given shape, tagged with the identifier `ident`.
operation cpu_allocation_model::preallocate(const shape& buffer_shape,
                                            const std::string& ident) const
{
    return make_op("cpu::preallocate",
                   {{"shape", to_value(buffer_shape)}, {"id", ident}});
}
// Name of the operation used to copy buffers on the cpu target.
std::string cpu_allocation_model::copy() const
{
    return "cpu::copy";
}
} // namespace cpu
......
......@@ -14,6 +14,7 @@ struct cpu_allocation_model
std::string name() const;
std::string copy() const;
operation allocate(const shape& s) const;
operation preallocate(const shape& s, const std::string& id) const;
};
} // namespace cpu
......
......@@ -440,7 +440,7 @@ struct cpu_apply
}
}
instruction_ref apply_pow(instruction_ref ins)
instruction_ref apply_pow(instruction_ref ins) const
{
auto beta = read_scalar<float>(ins->inputs()[1]);
if(beta.empty())
......@@ -451,7 +451,7 @@ struct cpu_apply
{ins->inputs().front()});
}
instruction_ref apply_pooling(instruction_ref ins)
instruction_ref apply_pooling(instruction_ref ins) const
{
auto&& op = ins->get_operator();
auto v = op.to_value();
......@@ -479,30 +479,20 @@ struct cpu_apply
return {r.at<T>()};
}
// Replace `ins` with `op`, reusing the instruction's own inputs; an output
// allocation is appended by the three-argument overload below.
// (The source contained both old and new diff signatures for these two
// overloads; resolved here to the merged const-qualified versions.)
instruction_ref replace(instruction_ref ins, const operation& op) const
{
    return replace(ins, op, ins->inputs());
}

// Replace `ins` with `op` applied to `inputs`. A freshly inserted allocation
// for the result shape is appended as the trailing (output) argument before
// the instruction is rewritten in the module.
instruction_ref
replace(instruction_ref ins, const operation& op, std::vector<instruction_ref> inputs) const
{
    inputs.push_back(insert_allocation(ins, ins->get_shape()));
    return modl->replace_instruction(ins, op, inputs);
}
// Insert a "cpu::allocate" instruction for shape `s` just before `ins` and
// return it. Output parameters are no longer special-cased here; parameter
// buffers are handled by the later passes (memory_coloring over
// "cpu::allocate" and preallocate_param in the target's pass list).
// (The source contained the removed old body alongside the new one; resolved
// here to the merged version.)
instruction_ref insert_allocation(instruction_ref ins, const shape& s) const
{
    return modl->insert_instruction(ins, make_op("cpu::allocate", {{"shape", to_value(s)}}));
}
};
......
#include <migraphx/config.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
// Operation that owns a preallocated host buffer.
//
// The buffer is created once in finalize() and the same argument is returned
// by every compute() call, so repeated evaluations reuse one allocation.
struct cpu_preallocate : auto_register_op<cpu_preallocate>
{
    shape s;              // shape of the buffer to allocate
    std::string id = "";  // identifier naming this buffer
    argument data;        // the allocated buffer; filled in by finalize()

    // Reflect only shape and id; `data` is runtime state, not an attribute.
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.s, "shape"), f(self.id, "id"));
    }
    std::string name() const { return "cpu::preallocate"; }
    // Takes no inputs; the output shape is the configured buffer shape.
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        check_shapes{inputs, *this}.has(0);
        return s;
    }
    // Hand out the buffer allocated during finalize().
    argument compute(context&, const shape&, const std::vector<argument>&) const { return data; }
    // Allocate the buffer once, before execution.
    void finalize(context&, const shape&, const std::vector<shape>&) { data = argument(s); }
    // Report global lifetime for the returned buffer.
    lifetime get_lifetime() const { return lifetime::global; }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -22,6 +22,7 @@
#include <migraphx/memory_coloring.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/preallocate_param.hpp>
#include <migraphx/cpu/fuse_ops.hpp>
#include <migraphx/cpu/write_literals.hpp>
#include <migraphx/cpu/allocation_model.hpp>
......@@ -76,6 +77,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
write_literals{},
dead_code_elimination{},
memory_coloring{"cpu::allocate"},
dead_code_elimination{},
preallocate_param{"scratch", cpu_allocation_model{}},
dead_code_elimination{}};
}
......
......@@ -61,6 +61,7 @@ add_library(migraphx_device
device/pad.cpp
device/pow.cpp
device/prelu.cpp
device/prefix_scan_sum.cpp
device/recip.cpp
device/reduce_max.cpp
device/reduce_mean.cpp
......@@ -140,7 +141,6 @@ add_library(migraphx_gpu
pack_int8_args.cpp
pad.cpp
pooling.cpp
preallocate_param.cpp
quant_convolution.cpp
reverse.cpp
rnn_variable_seq_lens.cpp
......@@ -194,6 +194,7 @@ register_migraphx_gpu_ops(hip_
pad
pow
prelu
prefix_scan_sum
recip
reduce_max
reduce_mean
......
......@@ -11,6 +11,11 @@ operation gpu_allocation_model::allocate(const shape& s) const
return make_op(name(), {{"shape", to_value(s)}});
}
// Construct the gpu target's preallocation operation: a hip memory
// allocation of the given shape, tagged with the identifier `ident`.
operation gpu_allocation_model::preallocate(const shape& buffer_shape,
                                            const std::string& ident) const
{
    return make_op("hip::hip_allocate_memory",
                   {{"shape", to_value(buffer_shape)}, {"id", ident}});
}
// Name of the operation used to copy buffers on the gpu target.
std::string gpu_allocation_model::copy() const
{
    return "hip::copy";
}
} // namespace gpu
......
......@@ -5,6 +5,7 @@
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/visit.hpp>
#include <migraphx/gpu/device/multi_index.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -15,79 +16,6 @@ namespace device {
#define MIGRAPHX_NO_DPP
#endif
// NOTE(review): the functor definitions below duplicate those in
// <migraphx/gpu/device/reduce_ops.hpp>, which this file includes above; the
// header copies are the canonical ones after the merge.

// Binary reducer: addition.
struct sum
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return x + y;
    }
};
// Binary reducer: multiplication.
struct product
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return x * y;
    }
};
// Unary post-op: identity.
struct id
{
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
    {
        return x;
    }
};
// Unary post-op: divide by the element count to turn a sum into a mean.
struct mean
{
    size_t item_num = 1;
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
    {
        return x / static_cast<T>(item_num);
    }
};
// Binary reducer: larger of the two values.
struct max
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return (x > y) ? x : y;
    }
};
// Binary reducer: smaller of the two values.
struct min
{
    template <class T, class U>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
    {
        return (x < y) ? x : y;
    }
};
// Converts implicitly to the lowest finite value of the target type.
struct lowest
{
    template <class T>
    __device__ __host__ operator T() const
    {
        return device_cast(std::numeric_limits<host_type<T>>::lowest());
    }
};
// Converts implicitly to the largest finite value of the target type.
struct highest
{
    template <class T>
    __device__ __host__ operator T() const
    {
        return device_cast(std::numeric_limits<host_type<T>>::max());
    }
};
#ifdef MIGRAPHX_NO_DPP
template <index_int N,
class Op,
......
#ifndef MIGRAPHX_GUARD_DEVICE_REDUCE_OPS_HPP
#define MIGRAPHX_GUARD_DEVICE_REDUCE_OPS_HPP
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Binary reduction functor: adds two values.
struct sum
{
    template <class A, class B>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(A lhs, B rhs) const
    {
        return lhs + rhs;
    }
};
// Binary reduction functor: multiplies two values.
struct product
{
    template <class A, class B>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(A lhs, B rhs) const
    {
        return lhs * rhs;
    }
};
// Unary functor: returns its argument unchanged (identity post-op).
struct id
{
    template <class V>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(V value) const
    {
        return value;
    }
};
// Unary functor: divides by the element count, turning a sum into a mean.
struct mean
{
    size_t item_num = 1; // number of elements that were reduced
    template <class V>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(V value) const
    {
        return value / static_cast<V>(item_num);
    }
};
// Binary reduction functor: yields the larger of the two values.
struct max
{
    template <class A, class B>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(A lhs, B rhs) const
    {
        return (lhs > rhs) ? lhs : rhs;
    }
};
// Binary reduction functor: yields the smaller of the two values.
struct min
{
    template <class A, class B>
    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(A lhs, B rhs) const
    {
        return (lhs < rhs) ? lhs : rhs;
    }
};
// Converts implicitly to the lowest finite value representable by the
// requested type (the identity element for a max reduction).
struct lowest
{
    template <class V>
    __device__ __host__ operator V() const
    {
        return device_cast(std::numeric_limits<host_type<V>>::lowest());
    }
};
// Converts implicitly to the largest finite value representable by the
// requested type (the identity element for a min reduction).
struct highest
{
    template <class V>
    __device__ __host__ operator V() const
    {
        return device_cast(std::numeric_limits<host_type<V>>::max());
    }
};
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_DEVICE_REDUCE_OPS_HPP
#ifndef MIGRAPHX_GUARD_DEVICE_SCAN_HPP
#define MIGRAPHX_GUARD_DEVICE_SCAN_HPP
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/visit.hpp>
#include <migraphx/gpu/device/multi_index.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Block-wide inclusive scan over the strided index range described by `fs`.
//
// Each pass over the stride handles up to idx.nlocal() elements: lane 0 seeds
// its element with the carry `x` from the previous pass, the lanes then
// combine partial results in shared memory with doubling offsets, and the
// carry is refreshed from the last slot so successive passes chain together.
// `buffer` is indexed by the local thread id, so N must cover idx.nlocal().
template <index_int N,
          class Op,
          class T,
          class ForStride,
          class Input,
          class Output,
          MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
__device__ void block_scan(index idx, Op op, T init, ForStride fs, Input input, Output output)
{
    using type = decltype(input(deduce_for_stride(fs)));
    // Shared staging buffer for the partial results of this block.
    MIGRAPHX_DEVICE_SHARED type buffer[N];
    // Carry accumulated from the previous pass over the stride.
    type x = init;
    fs([&](auto i) {
        // Lane 0 folds the carry into its element; the other lanes just load.
        if(idx.local == 0)
            buffer[idx.local] = op(input(i), x);
        else
            buffer[idx.local] = input(i);
        __syncthreads();
        for(index_int s = 1; s < idx.nlocal(); s *= 2)
        {
            // NOTE(review): within one iteration, lane t writes slot t+s while
            // lane t+s reads that same slot, with only the trailing
            // __syncthreads() separating them — confirm this access pattern is
            // intentionally race-free for the ops used here.
            if(idx.local + s < idx.nlocal())
            {
                buffer[idx.local + s] = op(buffer[idx.local], buffer[idx.local + s]);
            }
            __syncthreads();
        }
        // The last slot now holds the running total; carry it into the next pass.
        x = buffer[idx.nlocal() - 1];
        output(i, buffer[idx.local]);
    });
}
// Convenience overload: scan the first n elements using the block's
// local-stride iteration, forwarding to the ForStride-based block_scan.
template <index_int N, class Op, class T, class Input, class Output>
__device__ void block_scan(index idx, Op op, T init, index_int n, Input input, Output output)
{
    block_scan<N>(idx,
                  op,
                  init,
                  // Wrap local_stride so the callee can deduce the element type.
                  [&](auto f) -> decltype(f(index_int{})) { return idx.local_stride(n, f); },
                  input,
                  output);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_DEVICE_SCAN_HPP
#include <migraphx/gpu/device/prefix_scan_sum.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
// Inclusive prefix sum of `arg` along `axis`, written into `result`.
//
// Launches one 256-thread block per slice: the reduced shape (scan axis
// collapsed to 1) enumerates the independent slices, and each block scans
// the n elements of its slice with block_scan using the sum functor.
void prefix_scan_sum(hipStream_t stream, const argument& result, const argument& arg, int32_t axis)
{
    const index_int block_size = 256;
    // Number of elements to scan along the requested axis.
    const index_int n = arg.get_shape().lens()[axis];
    // Collapse the scan axis to 1 so rshape indexes the slices.
    auto rlens = result.get_shape().lens();
    rlens[axis] = 1;
    hip_visit_all(result, arg, result.get_shape().with_lens(rlens))(
        [=](auto output, auto input, auto rshape) {
            // One block per slice; block i/block_size owns slice rshape.multi(...).
            gs_launch(stream, rshape.elements() * block_size, block_size)(
                [=](auto i, auto idx) __device__ {
                    const auto ridx = rshape.multi(i / block_size);
                    // Map position j along the scan axis to a full multi-index.
                    auto compute_idx = [&](auto j) {
                        auto k = ridx;
                        k[axis] = j;
                        return k;
                    };
                    block_scan<block_size>(idx,
                                           sum{},
                                           0,
                                           n,
                                           [&](auto j) { return input[compute_idx(j)]; },
                                           [&](auto j, auto x) { output[compute_idx(j)] = x; });
                });
        });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -14,6 +14,7 @@ struct gpu_allocation_model
std::string name() const;
std::string copy() const;
operation allocate(const shape& s) const;
operation preallocate(const shape& s, const std::string& id) const;
};
} // namespace gpu
......
#ifndef MIGRAPHX_GUARD_DEVICE_PREFIX_SCAN_SUM_HPP
#define MIGRAPHX_GUARD_DEVICE_PREFIX_SCAN_SUM_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void prefix_scan_sum(hipStream_t stream, const argument& result, const argument& arg, int32_t axis);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_DEVICE_PREFIX_SCAN_SUM_HPP
#ifndef MIGRAPHX_GUARD_GPU_PREFIX_SCAN_SUM_HPP
#define MIGRAPHX_GUARD_GPU_PREFIX_SCAN_SUM_HPP
#include <migraphx/gpu/name.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/prefix_scan_sum.hpp>
#include <migraphx/op/prefix_scan_sum.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <migraphx/type_name.hpp>
#include <utility>
#include <iostream>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
// GPU wrapper for the prefix_scan_sum operation: validates shapes and
// forwards to the device kernel. The trailing input is the preallocated
// output buffer, which compute() writes into and aliases as the result.
struct hip_prefix_scan_sum : oper<hip_prefix_scan_sum>
{
    op::prefix_scan_sum op; // reference op carrying axis/exclusive/reverse

    template <class Self, class T>
    static auto reflect(Self& self, T f)
    {
        return migraphx::reflect(self.op, f);
    }
    // Validate the data inputs only; the final (allocation) shape is dropped
    // before delegating to the reference op's shape computation.
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        std::vector<shape> in_shapes{inputs};
        in_shapes.pop_back();
        check_shapes{in_shapes, *this}.standard();
        return op.normalize_compute_shape(in_shapes);
    }
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
    {
        // Only the plain inclusive forward scan is implemented on the device.
        if(op.exclusive or op.reverse)
            MIGRAPHX_THROW("Exclusive and reverse scan not supported");
        device::prefix_scan_sum(ctx.get_stream().get(), args[1], args[0], op.axis);
        return args[1];
    }
    // The result aliases the last input (the output buffer).
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_PREFIX_SCAN_SUM_HPP
......@@ -163,6 +163,7 @@ struct miopen_apply
add_extend_op("lrn");
add_extend_op("pad");
add_extend_op("pooling");
add_extend_op("prefix_scan_sum");
add_extend_op("reduce_max");
add_extend_op("reduce_mean");
add_extend_op("reduce_min");
......
......@@ -14,6 +14,7 @@
#include <migraphx/insert_pad.hpp>
#include <migraphx/memory_coloring.hpp>
#include <migraphx/normalize_ops.hpp>
#include <migraphx/preallocate_param.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/remap.hpp>
......@@ -31,7 +32,6 @@
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/mlir_conv.hpp>
#include <migraphx/gpu/pack_int8_args.hpp>
#include <migraphx/gpu/preallocate_param.hpp>
#include <migraphx/gpu/schedule_model.hpp>
#include <migraphx/gpu/sync_device.hpp>
#include <migraphx/gpu/target.hpp>
......@@ -98,7 +98,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}, not enabled(MIGRAPHX_DISABLE_SCHEDULE_PASS{})},
memory_coloring{"hip::allocate"},
sync_device{},
preallocate_param{"scratch", &ctx},
preallocate_param{"scratch", gpu_allocation_model{}},
dead_code_elimination{},
eliminate_workspace{},
eliminate_allocation{"hip::allocate"},
......
......@@ -109,7 +109,7 @@ std::pair<migraphx::program, std::vector<migraphx::argument>> run_verify::run_ta
std::transform(
tres.begin(), tres.end(), res.begin(), [&](auto& argu) { return t.copy_from(argu); });
return std::make_pair(p, res);
return std::make_pair(std::move(p), res);
}
template <class T>
......
#include "verify_program.hpp"
#include <migraphx/program.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/make_op.hpp>
// Verifies prefix_scan_sum along axis 1 on a small 3x3 float input.
struct test_prefix_scan_sum_2d_small : verify_program<test_prefix_scan_sum_2d_small>
{
    migraphx::program create_program() const
    {
        migraphx::program prog;
        auto* mod = prog.get_main_module();
        migraphx::shape param_shape{migraphx::shape::float_type, {3, 3}};
        auto input = mod->add_parameter("x", param_shape);
        mod->add_instruction(
            migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), input);
        return prog;
    }
};
// Verifies prefix_scan_sum along axis 1 on a larger 3x1000 float input,
// exercising the multi-pass path of the block scan.
struct test_prefix_scan_sum_2d_large : verify_program<test_prefix_scan_sum_2d_large>
{
    migraphx::program create_program() const
    {
        migraphx::program prog;
        auto* mod = prog.get_main_module();
        migraphx::shape param_shape{migraphx::shape::float_type, {3, 1000}};
        auto input = mod->add_parameter("x", param_shape);
        mod->add_instruction(
            migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), input);
        return prog;
    }
};
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment