Merge

7702c20d · Paul · c362e7fa · 9afce86d · 7702c20d · 7702c20d
Commit 7702c20d authored Aug 19, 2022 by Paul
20 changed files
--- a/src/targets/gpu/fuse_mlir.cpp
+++ b/src/targets/gpu/fuse_mlir.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/fuse_mlir.hpp>
+#include <migraphx/gpu/mlir.hpp>
+#include <migraphx/matcher.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/register_op.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct module;
+
+namespace gpu {
+
+#ifdef MIGRAPHX_MLIR
+// Operator placed in the graph by the conv+pointwise fusion in this file. It
+// wraps another operator (a convolution by default) and carries exactly one
+// submodule.
+struct mlir_conv
+{
+    // Wrapped operator; its compute_shape() decides the output shape.
+    operation op = make_op("convolution");
+
+    // Reflects the single field `op` under the name "op".
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.op, "op"));
+    }
+
+    std::string name() const { return "gpu::mlir_conv"; }
+
+    // Computes the output shape from the last two entries of `inputs`, which
+    // are forwarded to the wrapped operator. Throws when the inputs are not
+    // standard shapes, when `mods` does not hold exactly one module, or when
+    // fewer than two inputs are supplied.
+    shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
+    {
+        check_shapes{inputs, *this}.standard();
+        if(not(mods.size() == 1))
+            MIGRAPHX_THROW("should have one submodule.");
+        if(not(inputs.size() >= 2))
+            MIGRAPHX_THROW("should have at least two inputs.");
+        // The wrapped operator's arguments sit at the tail of the input list.
+        const auto tail = inputs.end() - 2;
+        return op.compute_shape({tail[0], tail[1]});
+    }
+};
+MIGRAPHX_REGISTER_OP(mlir_conv);
+
+namespace {
+// Rewrites a pointwise instruction that consumes a convolution into a single
+// gpu::mlir_conv instruction. The new instruction carries a freshly created
+// submodule containing the convolution followed by the pointwise body.
+struct find_conv_pointwise
+{
+    // Find a convolution followed by a pointwise operation. "contiguous"
+    // instructions between the two are skipped: "convolution" is bound to the
+    // convolution itself, "x" to the instruction the pointwise consumes.
+    auto matcher() const
+    {
+        auto convolution =
+            match::skip(match::name("contiguous"))(match::name("convolution").bind("convolution"));
+        return match::name("pointwise")(match::any_of[match::inputs()](convolution.bind("x")));
+    }
+
+    // Builds the fused submodule and replaces the matched pointwise
+    // instruction. Returns without changes when the pointwise body uses an
+    // operator outside the whitelist or when any input is not fp32.
+    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
+    {
+        auto ins      = r.result;
+        auto conv_ins = r.instructions["convolution"];
+        auto x_ins    = r.instructions["x"]; // input after contiguous
+        auto* pm      = ins->module_inputs().front();
+        // Whitelist pointwise operators
+        if(std::any_of(pm->begin(), pm->end(), [](const auto& i) {
+               return not contains({"@literal", "@param", "@return", "convolution", "add", "relu"},
+                                   i.name());
+           }))
+            return;
+        // Only fuse with fp32 for now
+        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
+               return i->get_shape().type() != shape::type_t::float_type;
+           }))
+            return;
+        // Parameter names are only needed once the fusion is known to happen.
+        // NOTE(review): the sorted names are paired positionally with
+        // ins->inputs(); this assumes sorted name order equals input order.
+        // With ten or more parameters a plain string sort would put "x10"
+        // before "x2" - confirm against the pointwise naming convention.
+        auto names = pm->get_parameter_names();
+        std::sort(names.begin(), names.end());
+        module_ref mm = mpm.create_module("mlir_" + pm->name());
+        mm->set_bypass();
+        std::unordered_map<instruction_ref, instruction_ref> param_map;
+        // The convolution operands become two extra parameters whose names
+        // follow the existing "x<i>" ones.
+        auto x    = mm->add_parameter("x" + std::to_string(names.size()),
+                                   conv_ins->inputs().at(0)->get_shape());
+        auto w    = mm->add_parameter("x" + std::to_string(names.size() + 1),
+                                   conv_ins->inputs().at(1)->get_shape());
+        auto conv = mm->add_instruction(conv_ins->get_operator(), {x, w});
+        // Map every pointwise parameter either to the in-module convolution
+        // (for the input that carried the convolution result) or to a new
+        // parameter of the fused module.
+        std::transform(names.begin(),
+                       names.end(),
+                       ins->inputs().begin(),
+                       std::inserter(param_map, param_map.end()),
+                       [&](auto name, auto input) {
+                           if(input == x_ins)
+                               return std::make_pair(pm->get_parameter(name), conv);
+                           return std::make_pair(pm->get_parameter(name),
+                                                 mm->add_parameter(name, input->get_shape()));
+                       });
+        mm->add_return(mm->insert_instructions(mm->end(), pm, param_map));
+
+        // Outer inputs: every pointwise input except the one that carried the
+        // convolution result, followed by the convolution's own inputs. The
+        // filter must use x_ins (not conv_ins): when a contiguous sits between
+        // the convolution and the pointwise, x_ins is that contiguous, and it
+        // got no parameter in the fused module above. Keeping it would leave
+        // one input more than the module has parameters.
+        std::vector<instruction_ref> inputs;
+        std::copy_if(ins->inputs().begin(),
+                     ins->inputs().end(),
+                     std::back_inserter(inputs),
+                     [&](auto input) { return input != x_ins; });
+        inputs.insert(inputs.end(), conv_ins->inputs().begin(), conv_ins->inputs().end());
+        mpm.get_module().replace_instruction(
+            ins, mlir_conv{conv_ins->get_operator()}, inputs, {mm});
+    }
+};
+} // namespace
+
+#endif
+
+// Runs the conv+pointwise fusion over the module handled by `mpm`. When the
+// build has no MLIR support (MIGRAPHX_MLIR undefined) the pass does nothing.
+void fuse_mlir::apply(module_pass_manager& mpm) const
+{
+#ifndef MIGRAPHX_MLIR
+    // Nothing to fuse without MLIR; silence the unused-parameter warning.
+    (void)mpm;
+#else
+    match::find_matches(mpm, find_conv_pointwise{});
+#endif
+}
+
+} // namespace gpu
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -336,6 +336,7 @@ void move_standard_front(std::vector<instruction_ref>& args)

 auto gpu_name(const std::string& s) { return match::name("gpu::" + s); }

+namespace {
 struct find_layernorm
 {
    auto matcher() const { return match::layernorm(&gpu_name); }
@@ -837,15 +838,6 @@ inline auto precompile_name(Strings... names) // NOLINT
    });
 }

-template <class... Ms>
-auto conv_bias_pointwise(Ms... ms)
-{
-    return precompile_name("pointwise")(
-        match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"),
-                                fusable_conv(match::used_once()).bind("conv")),
-        ms...);
-}
-
 struct find_conv_bias
 {
    context* ctx = nullptr;
@@ -1014,6 +1006,7 @@ struct find_commutative_broadcast
        m.replace_instruction(ins, ins->get_operator(), args);
    }
 };
+} // namespace

 struct find_contiguous
 {

--- a/src/targets/gpu/hip.cpp
+++ b/src/targets/gpu/hip.cpp
@@ -23,13 +23,13 @@
 */

 #include <migraphx/gpu/hip.hpp>
-
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/register_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/device/contiguous.hpp>
 #include <miopen/miopen.h>
-
+#include <memory>
+#include <mutex>
 #include <vector>

 namespace migraphx {
@@ -77,12 +77,38 @@ void* get_device_ptr(void* hptr)
    return result;
 }

-hip_ptr allocate_gpu(std::size_t sz, bool host = false)
+// Mutex-protected map from a raw address to a weak reference on the
+// shared_ptr that owns it. Entries do not keep the allocation alive.
+struct host_ptr_cache
+{
+    std::unordered_map<void*, std::weak_ptr<void>> cache;
+    std::mutex m;
+
+    // Returns the owner recorded for `ptr` if it is still alive, otherwise
+    // nullptr. An entry whose owner has already been destroyed is removed so
+    // the map does not accumulate dead nodes.
+    std::shared_ptr<void> get(void* ptr)
+    {
+        std::lock_guard<std::mutex> lock(m);
+        auto it = cache.find(ptr);
+        if(it == cache.end())
+            return nullptr;
+        auto owner = it->second.lock();
+        if(not owner)
+            cache.erase(it);
+        return owner;
+    }
+
+    // Records `p` under its raw address, replacing any previous entry.
+    void put(const std::shared_ptr<void>& p)
+    {
+        std::lock_guard<std::mutex> lock(m);
+        cache[p.get()] = p;
+    }
+};
+
+// Accessor for the one host_ptr_cache shared by this translation unit; the
+// instance is a function-local static created on the first call.
+static host_ptr_cache& get_host_ptr_cache()
+{
+    static host_ptr_cache instance;
+    return instance;
+}
+
+std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host = false)
 {
    if(sz > get_available_gpu_memory())
        MIGRAPHX_THROW("Memory not available to allocate buffer: " + std::to_string(sz));
-    void* result = nullptr;
-    auto status  = host ? hipHostMalloc(&result, sz) : hipMalloc(&result, sz);
+    void* alloc_ptr = nullptr;
+    auto status     = host ? hipHostMalloc(&alloc_ptr, sz) : hipMalloc(&alloc_ptr, sz);
    if(status != hipSuccess)
    {
        if(host)
@@ -90,16 +116,28 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
        else
            return allocate_gpu(sz, true);
    }
-    assert(result != nullptr);
-    return hip_ptr{result};
+    assert(alloc_ptr != nullptr);
+    std::shared_ptr<void> result = share(hip_ptr{alloc_ptr});
+    if(host)
+    {
+        get_host_ptr_cache().put(result);
+    }
+    return result;
 }

-hip_host_ptr register_on_gpu(void* ptr, std::size_t sz)
+std::shared_ptr<void> register_on_gpu(void* ptr, std::size_t sz)
 {
+    std::shared_ptr<void> result = get_host_ptr_cache().get(ptr);
+    if(result)
+    {
+        return result;
+    }
    auto status = hipHostRegister(ptr, sz, hipHostRegisterMapped);
    if(status != hipSuccess)
        MIGRAPHX_THROW("Gpu register failed: " + hip_error(status));
-    return hip_host_ptr{ptr};
+    result = share(hip_host_ptr{ptr});
+    get_host_ptr_cache().put(result);
+    return result;
 }

 template <class T>
@@ -115,7 +153,7 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
    return result;
 }

-hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
+std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host = false)
 {
    gpu_sync();
    auto result = allocate_gpu(sz, host);
@@ -137,22 +175,21 @@ hip_ptr write_to_gpu(const T& x)

 argument allocate_gpu(const shape& s, bool host)
 {
-    auto p = share(allocate_gpu(s.bytes() + 1, host));
+    auto p = allocate_gpu(s.bytes() + 1, host);
    return {s, [p]() mutable { return reinterpret_cast<char*>(p.get()); }};
 }

 argument register_on_gpu(const argument& arg)
 {
    auto arg_shared = arg.share();
-    auto p          = share(register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes()));
-    return {arg_shared.get_shape(), [p, a = std::move(arg_shared)]() mutable {
-                return get_device_ptr(p.get());
-            }}; // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
+    auto p          = register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes());
+    return {arg_shared.get_shape(),
+            [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
+}

 argument to_gpu(const argument& arg, bool host)
 {
-    auto p = share(write_to_gpu(arg.data(), arg.get_shape().bytes(), host));
+    auto p = write_to_gpu(arg.data(), arg.get_shape().bytes(), host);
    return {arg.get_shape(), p};
 }


--- a/src/targets/gpu/include/migraphx/gpu/code_object_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/code_object_op.hpp
@@ -38,12 +38,13 @@ struct context;

 struct code_object_op
 {
-    value::binary code_object;
-    std::string symbol_name;
-    std::size_t global;
-    std::size_t local;
-    std::vector<shape> expected_inputs;
-    shape output;
+    value::binary code_object{};
+    std::string symbol_name = "";
+    std::size_t global      = 0;
+    std::size_t local       = 0;
+    std::vector<shape> expected_inputs{};
+    shape output{};
+    std::int64_t output_arg = -1;
    kernel k{};

    template <class Self, class F>
@@ -66,9 +67,13 @@ struct code_object_op
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
    void finalize(context&, const shape&, const std::vector<shape>&);
+    // Resolves `output_arg` to an absolute argument position for a call with
+    // `n` arguments: a non-negative value is returned unchanged, a negative
+    // value counts back from the end (-1 is the last argument).
+    std::int64_t get_output_arg(std::size_t n) const
+    {
+        if(output_arg >= 0)
+            return output_arg;
+        // Convert first so the sum is computed in the signed domain instead of
+        // relying on unsigned wrap-around and an implicit conversion back.
+        return static_cast<std::int64_t>(n) + output_arg;
+    }
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
-        return shapes.size() - 1;
+        return get_output_arg(shapes.size());
    }

    friend std::ostream& operator<<(std::ostream& os, const code_object_op& op)

--- a/src/targets/gpu/include/migraphx/gpu/deconvolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/deconvolution.hpp
@@ -39,20 +39,20 @@ struct miopen_deconvolution
    op::deconvolution op;
    shared<convolution_descriptor> cd;
    miopenConvFwdAlgorithm_t algo{};
-    miopenHandle_t handle = nullptr;
+    uint64_t solution_id = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        // TODO: Add algo
-        return op::convolution::reflect(self.op, f);
+        return pack_join(op::deconvolution::reflect(self.op, f),
+                         pack(f(self.solution_id, "solution_id")));
    }

    std::string name() const { return "gpu::deconv"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {

--- a/src/targets/gpu/include/migraphx/gpu/device_name.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device_name.hpp
@@ -33,6 +33,8 @@ namespace gpu {

 std::string get_device_name();

+int get_device_id();
+
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/mlir_conv.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir_conv.hpp
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_MIOPEN_MLIR_CONV_HPP
-#define MIGRAPHX_GUARD_RTGLIB_MIOPEN_MLIR_CONV_HPP
+#ifndef MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP
+#define MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP

 #include <migraphx/config.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -30,18 +30,19 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-struct module;
+struct module_pass_manager;

 namespace gpu {
-struct mlir_conv
+
+struct fuse_mlir
 {
-    context* ctx;
-    std::string name() const { return "mlir::convolution"; }
-    void apply(module& m) const;
+    context* ctx = nullptr;
+    std::string name() const { return "gpu::fuse_mlir"; }
+    void apply(module_pass_manager& mpm) const;
 };

 } // namespace gpu
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
-#endif
+#endif // MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP
--- a/src/targets/gpu/include/migraphx/gpu/int8_conv_pack.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/int8_conv_pack.hpp
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP
 #define MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP

+#include <migraphx/argument.hpp>
 #include <migraphx/op/quant_dot.hpp>
 #include <migraphx/config.hpp>
 #include <utility>

--- a/src/targets/gpu/include/migraphx/gpu/int8_gemm_pack.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/int8_gemm_pack.hpp
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_INT8_GEMM_PACK_HPP
 #define MIGRAPHX_GUARD_RTGLIB_INT8_GEMM_PACK_HPP

+#include <migraphx/argument.hpp>
 #include <migraphx/op/quant_dot.hpp>
 #include <migraphx/config.hpp>
 #include <utility>

--- a/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_MLIR_HPP
+#define MIGRAPHX_GUARD_RTGLIB_GPU_MLIR_HPP
+
+#include <string>
+#include <vector>
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/code_object_op.hpp>
+#include <migraphx/instruction_ref.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct module;
+namespace gpu {
+
+std::string dump_mlir(const module& m);
+code_object_op compile_mlir(const context& ctx, const module& m);
+
+instruction_ref insert_mlir(module& m,
+                            instruction_ref ins,
+                            code_object_op co,
+                            const std::vector<instruction_ref>& inputs);
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/perfdb.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/perfdb.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_GPU_PERFDB_HPP
+#define MIGRAPHX_GUARD_GPU_PERFDB_HPP
+
+#include <migraphx/config.hpp>
+#include <migraphx/shape.hpp>
+#include <migraphx/operation.hpp>
+#include <string>
+#include <vector>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct problem_params // Aggregate passed to get_mlir_perf_for_conv(): an operation with its input and output shapes.
+{
+    operation op; // operation being looked up (presumably a convolution, going by the lookup function's name - confirm)
+    std::vector<shape> inputs; // shapes of the operation's inputs
+    shape output; // shape of the operation's result
+};
+
+std::string get_mlir_perf_for_conv(const problem_params& pp);
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_PERFDB_HPP
--- a/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
@@ -41,7 +41,7 @@ struct miopen_quant_convolution
    bool int8_x4_format = false;
    shared<convolution_descriptor> cd;
    miopenConvFwdAlgorithm_t algo{};
-    miopenHandle_t handle = nullptr;
+    uint64_t solution_id = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
@@ -55,7 +55,7 @@ struct miopen_quant_convolution
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {

--- a/src/targets/gpu/jit/layernorm.cpp
+++ b/src/targets/gpu/jit/layernorm.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
@@ -61,7 +84,7 @@ struct layernorm_compiler : compiler<layernorm_compiler>
        }
        auto preloads   = preload::broadcasts(axis, inputs);
        auto relements  = inputs[0].lens()[axis] / vec.size;
-        auto nelements  = inputs.back().elements() / inputs[0].lens()[axis];
+        auto nelements  = (inputs.back().elements() / inputs[0].lens()[axis]);
        auto block_size = compute_block_size(relements, 256);
        hip_compile_options options;
        options.set_launch_params(

--- a/src/targets/gpu/jit/mlir.cpp
+++ b/src/targets/gpu/jit/mlir.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/compiler.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/gpu/context.hpp>
+
+#include <migraphx/gpu/mlir.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// Compiler entry registered for "gpu::mlir_conv" instructions: compiles the
+// instruction's submodule with compile_mlir() and swaps the instruction for
+// whatever insert_mlir() produces.
+struct mlir_compiler : compiler<mlir_compiler>
+{
+    std::vector<std::string> names() const { return {"gpu::mlir_conv"}; }
+
+    // Not used for this compiler; yields a default-constructed operation.
+    operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; }
+
+    // Compiles the first (and expected only) submodule of `ins`. The assert
+    // checks that the instruction has exactly one more input than the
+    // submodule has parameters.
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation&) const
+    {
+        auto* smod = ins->module_inputs().front();
+        assert(smod->get_parameter_names().size() == ins->inputs().size() - 1);
+        return insert(compile_mlir(ctx, *smod));
+    }
+
+    // Wraps the compiled code object in a callback that performs the actual
+    // replacement inside the module.
+    compiler_replace insert(code_object_op co) const
+    {
+        return [compiled = std::move(co)](module& m, instruction_ref ins) {
+            m.replace_instruction(ins, insert_mlir(m, ins, compiled, ins->inputs()));
+        };
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/jit/softmax.cpp
+++ b/src/targets/gpu/jit/softmax.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
@@ -52,12 +75,12 @@ struct softmax_compiler : compiler<softmax_compiler>
        auto faxis = find_fast_axis({inputs.front()});
        vectorize vec{};
        // Vectorize if the axis is a reduction axis
-        if(inputs.back().lens()[faxis] == 1)
+        if(faxis == axis)
        {
            vec = vectorize::elements(faxis, inputs);
        }
        auto relements  = inputs[0].lens()[axis] / vec.size;
-        auto nelements  = inputs.back().elements() / relements;
+        auto nelements  = (inputs.back().elements() / inputs[0].lens()[axis]);
        auto block_size = compute_block_size(relements, 256);
        hip_compile_options options;
        options.set_launch_params(

--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -27,6 +27,7 @@
 #include <migraphx/kernels/hip.hpp>
 #include <migraphx/kernels/types.hpp>
 #include <migraphx/kernels/integral_constant.hpp>
+#include <migraphx/kernels/type_traits.hpp>

 namespace migraphx {

@@ -53,29 +54,51 @@ struct index
        return blockDim.x; // NOLINT
    }
 #endif
+    template <class N, class Stride>
+    static constexpr auto max_stride_iterations(N n, Stride stride) // Evaluates (n - 1) / stride + 1; for_stride compares the result with 1 to decide whether a loop is needed.
+    {
+        return (n - _c<1>) / stride + _c<1>; // presumably integer division, i.e. n / stride rounded up for n >= 1 - confirm for the _c<> constant types
+    }

-    template <class F>
-    __device__ void global_stride(index_int n, F f) const
+    template <class F, class N, class Stride>
+    static constexpr void for_stride(index_int start, N n, Stride stride, F f)
    {
-        const auto stride = nglobal();
-        for(index_int i = global; i < n; i += stride)
+        if constexpr(not is_integral<N>{} and not is_integral<Stride>{} and
+                     max_stride_iterations(n, stride) == 1)
        {
-            f(i);
+            if constexpr(stride > n)
+            {
+                if(start < n)
+                    f(start);
+            }
+            else
+            {
+                f(start);
+            }
+        }
+        else
+        {
+            for(index_int i = start; i < n; i += stride)
+            {
+                f(i);
+            }
        }
    }

-    template <class F>
-    __device__ void local_stride(index_int n, F f) const
+    template <class F, class N>
+    __device__ void global_stride(N n, F f) const
    {
-        const auto stride = nlocal();
-        for(index_int i = local; i < n; i += stride)
-        {
-            f(i);
-        }
+        for_stride(global, n, nglobal(), f);
+    }
+
+    template <class F, class N>
+    __device__ void local_stride(N n, F f) const
+    {
+        for_stride(local, n, nlocal(), f);
    }
 };

-inline __device__ index make_index()
+inline __device__ __attribute__((const)) index make_index()
 {
    return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
 #ifndef MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP
 #define MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP
 #include <migraphx/kernels/reduce.hpp>
@@ -23,11 +46,9 @@ __device__ void generic_binary_layernorm(
    F compute, BinOp op, Output output, Input1 input1, Input2 input2, Inputs... inputs)
 {
    using reduce_output = reduce::with_axis<Input1, Axis>;
-    constexpr auto relements =
-        get_shape_c<Input1>{}.elements() / get_shape_c<reduce_output>{}.elements();
-    MIGRAPHX_ASSERT(relements > 0);
    reduce::block::run<reduce_output>([&](auto, auto r) {
        using value_type = typename Input1::type;
+        constexpr auto relements = r.template elements<Input1>();
        auto means       = r.reduce(op::sum{}, make_array<value_type>(0, 0), [&](auto x1, auto x2) {
            auto x = op(x1, x2);
            return make_array(x, x * x) / value_type{relements};

--- a/src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
@@ -90,7 +90,7 @@ struct lowest
    template <class T>
    constexpr operator T() const
    {
-        return numeric_lowest<T>();
+        return numeric_lowest<vec_type<T>>();
    }
 };

@@ -99,7 +99,7 @@ struct highest
    template <class T>
    constexpr operator T() const
    {
-        return numeric_max<T>();
+        return numeric_max<vec_type<T>>();
    }
 };
 } // namespace migraphx

--- a/src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
@@ -186,7 +186,8 @@ __device__ auto auto_preload(index idx)
 {
    return make_transform([=](auto f, auto... xs) {
        auto invoke = [=](auto... ys) {
-            __syncthreads();
+            if constexpr((Bs or ...))
+                __syncthreads();
            f(ys...);
        };
        join(invoke, preload_copy<Bs>(idx, xs)...);

--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -224,6 +224,18 @@ struct block
                idx.local_stride(x.get_shape().elements(), [&](auto j) { f(x[j], xs[j]...); });
            });
        }
+
+        template <class Input>
+        constexpr auto elements() const // Element count of the slice that `slicer` produces for Input, scaled by the vector width of Input's value type.
+        {
+            using reduce_type        = decltype(slicer(Input{})); // type of the slice taken from a default-constructed Input
+            using value_type         = typename Input::type;
+            constexpr auto relements = get_shape_c<reduce_type>{}.elements(); // elements in the slice's shape
+            if constexpr(vec_size<value_type>() > 1)
+                return relements * vec_size<value_type>(); // presumably converts a count of vectors into a count of scalars - confirm
+            else
+                return relements; // kept as a separate branch: the two returns may have different deduced types
+        }
    };

    template <class Slicer>
@@ -281,6 +293,13 @@ struct lane
                }
            });
        }
+
+        template <class Input>
+        constexpr auto elements() const // Element count of the slice that `slicer` produces for Input; unlike the block variant, no vector-width scaling is applied.
+        {
+            using reduce_type = decltype(slicer(Input{})); // type of the slice taken from a default-constructed Input
+            return get_shape_c<reduce_type>{}.elements();
+        }
    };

    template <class Slicer>