Merge remote-tracking branch 'origin/jit-vector-reduce' into transformer_opts

4b83b5a1 · turneram · 7f387483 · c84154b8 · 4b83b5a1 · 4b83b5a1
Commit 4b83b5a1 authored May 20, 2022 by turneram
12 changed files
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -22,6 +22,8 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_FINALIZE)
+
 struct module_impl
 {
    // A list is used to keep references to an instruction stable
@@ -553,8 +555,14 @@ instruction_ref module::find_dangling_reference() const

 void module::finalize(context& ctx)
 {
+    const bool trace = enabled(MIGRAPHX_TRACE_FINALIZE{});
    for(auto ins : iterator_for(*this))
    {
+        if(trace)
+        {
+            std::cout << "Finalize: ";
+            this->debug_print(ins);
+        }
        ins->finalize(ctx);
        for(const auto& smod : ins->module_inputs())
        {

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -131,6 +131,7 @@ add_library(migraphx_gpu
    clip.cpp
    code_object_op.cpp
    compile_ops.cpp
+    compile_gen.cpp
    compile_hip.cpp
    compile_hip_code_object.cpp
    compiler.cpp

--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
+#include <migraphx/gpu/compile_gen.hpp>
+#include <migraphx/shape.hpp>
+#include <migraphx/permutation.hpp>
+#include <migraphx/stringutils.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace gen {
+
+// Candidate vector widths, widest first. When every input is half precision
+// only a width of 2 (half2) is offered; otherwise 4 is offered ahead of 2.
+static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
+{
+    const bool has_non_half = std::any_of(inputs.begin(), inputs.end(), [](const auto& s) {
+        return s.type() != shape::half_type;
+    });
+    if(has_non_half)
+        return {4, 2};
+    return {2};
+}
+
+// Choose the vector width to use along `axis` for a set of input shapes.
+//
+// Every input proposes a width and the smallest proposal wins, so the result
+// is a width that each input accepts. The returned object carries that width
+// together with `axis`. An empty `inputs` yields a width of 1.
+vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
+{
+    // std::min_element returns the end iterator for an empty range, which must
+    // not be dereferenced; with no inputs fall back to the scalar width.
+    if(inputs.empty())
+        return {1, axis};
+    auto sizes = vector_sizes(inputs);
+    std::vector<std::size_t> max_vec_size;
+    max_vec_size.reserve(inputs.size());
+    std::transform(inputs.begin(),
+                   inputs.end(),
+                   std::back_inserter(max_vec_size),
+                   [&](const auto& input) -> std::size_t {
+                       auto stride = input.strides()[axis];
+                       auto len    = input.lens()[axis];
+                       // Any stride other than 0 or 1 along the axis yields width 1
+                       if(stride != 0 and stride != 1)
+                           return 1;
+                       // A length-1 axis on a tensor with more elements than the
+                       // widest candidate proposes that widest candidate
+                       if(len == 1 and input.elements() > sizes.front())
+                           return sizes.front();
+                       // Otherwise propose the first candidate that divides the length
+                       auto it = std::find_if(
+                           sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
+                       if(it != sizes.end())
+                           return *it;
+                       return 1;
+                   });
+    return {*std::min_element(max_vec_size.begin(), max_vec_size.end()), axis};
+}
+
+// Render this transformer as source text of the form "vectorize<SIZE, AXIS>()".
+std::string vectorize::str() const
+{
+    std::string text = "vectorize<";
+    text += to_string(size);
+    text += ", ";
+    text += to_string(axis);
+    text += ">()";
+    return text;
+}
+
+// Flag the inputs to preload: those whose stride along `axis` is zero. If the
+// flagged inputs together occupy 4096 bytes or more, every flag is cleared and
+// nothing is preloaded.
+preload preload::broadcasts(std::size_t axis, const std::vector<shape>& inputs)
+{
+    const std::size_t max_lds_bytes = 4096;
+    const auto is_broadcast = [&](const shape& input) { return input.strides()[axis] == 0; };
+    std::vector<bool> flags(inputs.size());
+    std::transform(inputs.begin(), inputs.end(), flags.begin(), is_broadcast);
+    // Total size of only the flagged inputs
+    const auto flagged_bytes = std::accumulate(
+        inputs.begin(), inputs.end(), std::size_t{0}, [&](std::size_t total, const shape& s) {
+            return is_broadcast(s) ? total + s.bytes() : total;
+        });
+    // TODO: Try to partially preload items
+    if(flagged_bytes >= max_lds_bytes)
+        std::fill(flags.begin(), flags.end(), false);
+    return {flags};
+}
+
+// Render this transformer as source text of the form
+// "auto_preload<false, F0, F1, ...>(idx)", where each F is "true" or "false".
+//
+// A literal `false` is emitted first and the final entry of `args` is left out
+// (NOTE(review): presumably the last shape is the kernel output, which is never
+// preloaded; confirm against the auto_preload kernel helper).
+std::string preload::str() const
+{
+    // std::prev on the end of an empty vector would step before begin(), which
+    // is undefined behaviour, so an empty `args` is treated as having no flags.
+    const auto last = args.empty() ? args.end() : std::prev(args.end());
+    std::vector<std::string> bool_strs;
+    std::transform(args.begin(), last, std::back_inserter(bool_strs), [](bool b) {
+        return std::string{b ? "true" : "false"};
+    });
+    return "auto_preload<false, " + join_strings(bool_strs, ", ") + ">(idx)";
+}
+
+// True when at least one entry of `args` is set; false for an empty `args`.
+bool preload::is_preloading() const
+{
+    return std::any_of(args.begin(), args.end(), [](bool flag) { return flag; });
+}
+
+// Returns the position at which the permutation computed by find_permutation
+// for `inputs` holds its largest value (the first such position on ties).
+std::size_t find_fast_axis(const std::vector<shape>& inputs)
+{
+    const auto perm = find_permutation(inputs);
+    return std::distance(perm.begin(), std::max_element(perm.begin(), perm.end()));
+}
+
+// Joins already-rendered transformer expressions into one ", "-separated
+// argument list.
+std::string make_transformer_args(std::vector<std::string> transformers)
+{
+    const std::string separator = ", ";
+    return join_strings(std::move(transformers), separator);
+}
+
+} // namespace gen
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -119,8 +119,21 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
    };
 }

+std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
+{
+    // Double from 128 while within both n and max_block_size, then step back once (min 64)
+    std::size_t candidate = 128;
+    for(; candidate <= max_block_size and candidate <= n; candidate *= 2) {}
+    return candidate / 2;
+}
+
 operation compile_hip_code_object(const std::string& content, hip_compile_options options)
 {
+    assert(options.global > 0);
+    assert(options.local > 0);
+    assert(not options.inputs.empty());
+    assert(options.inputs.size() == options.virtual_inputs.size() or
+           options.virtual_inputs.empty());
    std::vector<src_file> srcs;
    std::transform(migraphx_kernels().begin(),
                   migraphx_kernels().end(),

--- a/src/targets/gpu/include/migraphx/gpu/compile_gen.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_gen.hpp
+#ifndef MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
+#define MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
+
+#include <migraphx/config.hpp>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct shape;
+
+namespace gpu {
+namespace gen {
+
+// A vector width (`size`) and the axis it applies to (`axis`); a default
+// constructed object holds width 1 on axis 0.
+struct vectorize
+{
+    std::size_t size{1};
+    std::size_t axis{0};
+    // Text form of this transformer for use in generated source
+    std::string str() const;
+    // Builds a vectorize for `axis` from the given input shapes
+    static vectorize elements(std::size_t axis, const std::vector<shape>& inputs);
+};
+// One boolean flag per input (`args`); empty by default.
+struct preload
+{
+    std::vector<bool> args{};
+    // Whether this object requests any preloading
+    bool is_preloading() const;
+    // Text form of this transformer for use in generated source
+    std::string str() const;
+    // Builds the flags for `axis` from the given input shapes
+    static preload broadcasts(std::size_t axis, const std::vector<shape>& inputs);
+};
+
+std::size_t find_fast_axis(const std::vector<shape>& inputs);
+
+std::string make_transformer_args(std::vector<std::string> transformers);
+
+// Convenience overload: renders each transformer through its str() member and
+// forwards the resulting strings to the std::vector<std::string> overload.
+// Arguments are taken by const reference so that transformers owning storage
+// are not copied merely to be stringified.
+template <class... Ts>
+std::string make_transformer_args(const Ts&... xs)
+{
+    return make_transformer_args(std::vector<std::string>{xs.str()...});
+}
+
+} // namespace gen
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
--- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
@@ -46,6 +46,8 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);

 operation compile_hip_code_object(const std::string& content, hip_compile_options options);

+std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024);
+
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/jit/pointwise.cpp
+++ b/src/targets/gpu/jit/pointwise.cpp
@@ -2,6 +2,7 @@
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>

 #include <migraphx/cpp_generator.hpp>
 #include <migraphx/ranges.hpp>
@@ -17,6 +18,8 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

+using namespace migraphx::gpu::gen; // NOLINT
+
 static const char* const pointwise_kernel = R"__migraphx__(
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/pointwise.hpp>
@@ -30,7 +33,7 @@ extern "C" {
 __global__ void ${kernel}(${params}) 
 {
    auto idx = make_index();
-    pointwise(idx, auto_preload<${preloads}>(idx), vectorize<${vec_size}, ${axis}>())(${lambda}, ${args});
+    pointwise(idx, ${transformers})(${lambda}, ${args});
 }
    
 }
@@ -62,75 +65,6 @@ struct pointwise_compiler : compiler<pointwise_compiler>
        else
            return 1;
    }
-    static std::size_t find_fast_axis(const std::vector<shape>& inputs)
-    {
-        auto permutation = find_permutation(inputs);
-        auto it          = std::max_element(permutation.begin(), permutation.end());
-        return it - permutation.begin();
-    }
-    static std::vector<bool> preload(std::size_t axis, const std::vector<shape>& inputs)
-    {
-        const std::size_t max_lds_bytes = 4096;
-        std::vector<bool> result;
-        std::transform(inputs.begin(),
-                       inputs.end(),
-                       std::back_inserter(result),
-                       [&](const shape& input) { return input.strides()[axis] == 0; });
-        auto bytes = std::inner_product(inputs.begin(),
-                                        inputs.end(),
-                                        result.begin(),
-                                        std::size_t{0},
-                                        std::plus<>{},
-                                        [](const shape& s, bool b) -> std::size_t {
-                                            if(b)
-                                                return s.bytes();
-                                            return 0;
-                                        });
-        if(bytes < max_lds_bytes)
-            return result;
-        // TODO: Try to partially preload items
-        std::fill(result.begin(), result.end(), false);
-        return result;
-    }
-    static std::string preload_str(const std::vector<bool>& bs)
-    {
-        std::vector<std::string> bool_strs;
-        std::transform(bs.begin(), std::prev(bs.end()), std::back_inserter(bool_strs), [](bool b) {
-            if(b)
-                return "true";
-            return "false";
-        });
-        return "false, " + join_strings(bool_strs, ", ");
-    }
-    static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
-    {
-        // If all inputs is half then only use half2
-        if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
-               return s.type() == shape::half_type;
-           }))
-            return {2};
-        return {4, 2};
-    }
-    static auto vectorize_elements(std::size_t axis, const std::vector<shape>& inputs)
-    {
-        auto sizes = vector_sizes(inputs);
-        std::vector<std::size_t> max_vec_size;
-        std::transform(inputs.begin(),
-                       inputs.end(),
-                       std::back_inserter(max_vec_size),
-                       [&](const auto& input) -> std::size_t {
-                           auto stride = input.strides()[axis];
-                           auto len    = input.lens()[axis];
-                           if(stride != 0 and stride != 1)
-                               return 1;
-                           auto it = std::find_if(
-                               sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
-                           if(it != sizes.end())
-                               return *it;
-                           return 1;
-                       });
-        return *std::min_element(max_vec_size.begin(), max_vec_size.end());
-    }
    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
        hip_compile_options options;
@@ -139,23 +73,19 @@ struct pointwise_compiler : compiler<pointwise_compiler>
        options.virtual_inputs = reduce_dims(inputs);
        options.params         = "-Wno-float-equal";
        auto axis              = find_fast_axis(options.virtual_inputs);
-        auto vec_size          = vectorize_elements(axis, options.virtual_inputs);
-        auto preloads          = preload(axis, options.virtual_inputs);
-        auto is_preloading =
-            std::accumulate(preloads.begin(), preloads.end(), false, std::logical_or<>{});
-        options.kernel_name = v.get("kernel", "kernel");
-        options.set_launch_params(v,
-                                  compute_global_for(ctx,
-                                                     options.output.elements() / vec_size,
-                                                     oversubscribe_if(not is_preloading)));
+        auto vec               = vectorize::elements(axis, options.virtual_inputs);
+        auto preloads          = preload::broadcasts(axis, options.virtual_inputs);
+        options.set_launch_params(
+            v,
+            compute_global_for(ctx,
+                               options.output.elements() / vec.size,
+                               oversubscribe_if(not preloads.is_preloading())));
        auto src = interpolate_string(pointwise_kernel,
                                      {{"kernel", options.kernel_name},
                                       {"params", enum_params(inputs.size(), "void * private_p")},
                                       {"args", enum_params(inputs.size(), "private_p")},
                                       {"lambda", v.at("lambda").to<std::string>()},
-                                       {"vec_size", std::to_string(vec_size)},
-                                       {"axis", std::to_string(axis)},
-                                       {"preloads", preload_str(preloads)},
+                                       {"transformers", make_transformer_args(preloads, vec)},
                                       {"preamble", v.get("preamble", std::string{})}});
        return compile_hip_code_object(src, options);
    }

--- a/src/targets/gpu/jit/reduce.cpp
+++ b/src/targets/gpu/jit/reduce.cpp
@@ -2,6 +2,7 @@
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>

 #include <migraphx/cpp_generator.hpp>
 #include <migraphx/ranges.hpp>
@@ -16,9 +17,12 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

+using namespace migraphx::gpu::gen; // NOLINT
+
 static const char* const simple_reduce_kernel = R"__migraphx__(
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/reduce.hpp>
+#include <migraphx/kernels/vectorize.hpp>
 #include <args.hpp>

 namespace migraphx {
@@ -26,9 +30,10 @@ namespace migraphx {
 ${preamble}

 extern "C" {
-__global__ void kernel(void* input_p, void* output_p) 
+__global__ void reduce_kernel(void* input_p, void* output_p) 
 {
-    make_tensors()(input_p, output_p)([](auto input, auto output) {
+    
+    transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) {

        simple_reduce<reduce::${algo}>(${reduction}, ${init}, input, output, ${read}, ${write});
    });
@@ -40,14 +45,6 @@ __global__ void kernel(void* input_p, void* output_p)

 )__migraphx__";

-constexpr std::size_t compute_block_size(std::size_t n, std::size_t max_block_size = 1024)
-{
-    size_t block_size = 128;
-    while(block_size <= max_block_size and block_size <= n)
-        block_size *= 2;
-    return block_size / 2;
-}
-
 static std::size_t get_reduce_elements(const std::vector<shape>& inputs)
 {
    return inputs.front().elements() / inputs.back().elements();
@@ -101,32 +98,42 @@ struct reduce_compiler : compiler<reduce_compiler>
    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
        hip_compile_options options;
-        auto reduce_elements = get_reduce_elements(inputs);
-        auto algo            = v.get("algo", get_reduce_algo(inputs));
+        options.inputs         = inputs;
+        options.output         = inputs.back();
+        options.virtual_inputs = reduce_dims(inputs);
+        auto faxis             = find_fast_axis({options.virtual_inputs.front()});
+        vectorize vec{};
+        // Vectorize if the axis is a reduction axis
+        if(options.virtual_inputs.back().lens()[faxis] == 1)
+        {
+            vec = vectorize::elements(faxis, options.virtual_inputs);
+        }
+        auto relements = get_reduce_elements(options.virtual_inputs) / vec.size;
+        auto nelements = options.virtual_inputs.back().elements();
+        auto algo      = v.get("algo", get_reduce_algo(options.virtual_inputs));
        if(algo == "block")
        {
-            auto block_size = compute_block_size(reduce_elements, 256);
+            auto block_size = compute_block_size(relements, 256);
            options.set_launch_params(
-                v, compute_global_for(ctx, inputs.back().elements() * block_size, 256), block_size);
+                v, compute_global_for(ctx, nelements * block_size, 256), block_size);
        }
        else if(algo == "lane")
        {
-            options.set_launch_params(v, compute_global_for(ctx, inputs.back().elements(), 256));
+            options.set_launch_params(v, compute_global_for(ctx, nelements, 256));
        }
        else
        {
            MIGRAPHX_THROW("Unknown reduce algo: " + algo);
        }
-        options.inputs         = inputs;
-        options.output         = inputs.back();
-        options.virtual_inputs = reduce_dims(inputs);
-        std::string identity   = "[](auto x) { return x; }";
-        auto src               = interpolate_string(simple_reduce_kernel,
+        options.kernel_name  = "reduce_kernel";
+        std::string identity = "[](auto x) { return x; }";
+        auto src             = interpolate_string(simple_reduce_kernel,
                                      {{"reduction", v.at("reduction").to<std::string>()},
                                       {"init", v.get("init", std::string{"0"})},
                                       {"read", v.get("read", identity)},
                                       {"write", v.get("write", identity)},
                                       {"algo", algo},
+                                       {"transformers", make_transformer_args(vec)},
                                       {"preamble", v.get("preamble", std::string{})}});
        options.params += "-Wno-float-equal";
        return compile_hip_code_object(src, options);

--- a/src/targets/gpu/kernel.cpp
+++ b/src/targets/gpu/kernel.cpp
@@ -59,6 +59,8 @@ void launch_kernel(hipFunction_t fun,
                   void* kernargs,
                   std::size_t size)
 {
+    assert(global > 0);
+    assert(local > 0);
    void* config[] = {
 // HIP_LAUNCH_PARAM_* are macros that do horrible things
 #ifdef MIGRAPHX_USE_CLANG_TIDY

--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -178,9 +178,12 @@ struct block
        __device__ auto reduce(Op op, T init, Read read) const
        {
            return sliced(slicer, [=](auto x, auto... xs) {
-                return block_reduce(idx, op, init, x.get_shape().elements(), [&](auto j) {
-                    return read(x[j], xs[j]...);
-                });
+                return vec_reduce(block_reduce(idx,
+                                               op,
+                                               init,
+                                               x.get_shape().elements(),
+                                               [&](auto j) { return read(x[j], xs[j]...); }),
+                                  op);
            });
        }


--- a/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
@@ -147,5 +147,19 @@ constexpr auto vec_packed_transform(Ts... xs)
    };
 }

+template <class T, class Op>
+constexpr auto vec_reduce(T x, Op op)
+{
+    if constexpr(vec_size<T>() >= 2)
+    { // Fold every element of the vector into one value with op, left to right
+        vec_type<T> acc = x[0];
+        for(int lane = 1; lane < vec_size<T>(); ++lane)
+            acc = op(acc, x[lane]);
+        return acc;
+    }
+    else // Fewer than two elements: x is returned unchanged
+        return x;
+}
+
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp
@@ -213,7 +213,9 @@ template <index_int N, index_int Axis, class T>
 __device__ __host__ auto vectorize_tensor(T x)
 {
    constexpr auto shape = get_shape_c<T>{};
-    if constexpr(shape.strides[Axis] == 0)
+    if constexpr(shape.lens[Axis] == 1)
+        return x;
+    else if constexpr(shape.strides[Axis] == 0)
        return tensor_step<N>(x, _c<Axis>);
    else
        return as_vec<N>(x, _c<Axis>);