Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into add-conv_bn_add-test

4a39a0f7 · Shucai Xiao · 5564172e · bb827865 · 4a39a0f7 · 4a39a0f7
Commit 4a39a0f7 authored Oct 11, 2021 by Shucai Xiao
20 changed files
--- a/src/targets/cpu/layernorm.cpp
+++ b/src/targets/cpu/layernorm.cpp
@@ -31,7 +31,7 @@ struct dnnl_layernorm : dnnl_op<dnnl_layernorm, dnnl::layer_normalization_forwar
    get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        return {dnnl::prop_kind::forward_inference,
-                m.at(DNNL_ARG_SRC),
+                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
                1e-12f,
                dnnl::normalization_flags::none};
    }

--- a/src/targets/cpu/logsoftmax.cpp
+++ b/src/targets/cpu/logsoftmax.cpp
@@ -12,7 +12,7 @@ struct dnnl_logsoftmax : dnnl_extend_op<dnnl_logsoftmax, dnnl::logsoftmax_forwar
    get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        int axis = this->op.axis;
-        return {dnnl::prop_kind::forward_inference, m.at(DNNL_ARG_SRC_0), axis};
+        return {dnnl::prop_kind::forward_inference, m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)), axis};
    }
 };


--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -66,7 +66,10 @@ struct cpu_im2col
    }

    static std::string name() { return "cpu::im2col"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        return op.normalize_compute_shape(inputs);
+    }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
@@ -389,8 +392,10 @@ struct cpu_apply
        extend_op("concat", "dnnl::concat");
        extend_op("contiguous", "dnnl::reorder");
        extend_op("convolution", "dnnl::convolution");
+#ifndef MIGRAPHX_ENABLE_ZENDNN
        extend_op("deconvolution", "dnnl::deconvolution");
        extend_op("dot", "dnnl::dot");
+#endif
        extend_op("erf", "cpu::erf");
        extend_op("gather", "cpu::gather");
        extend_op("logsoftmax", "dnnl::logsoftmax");
@@ -437,7 +442,7 @@ struct cpu_apply
        }
    }

-    instruction_ref apply_pow(instruction_ref ins)
+    instruction_ref apply_pow(instruction_ref ins) const
    {
        auto beta = read_scalar<float>(ins->inputs()[1]);
        if(beta.empty())
@@ -448,7 +453,7 @@ struct cpu_apply
                       {ins->inputs().front()});
    }

-    instruction_ref apply_pooling(instruction_ref ins)
+    instruction_ref apply_pooling(instruction_ref ins) const
    {
        auto&& op = ins->get_operator();
        auto v    = op.to_value();
@@ -476,30 +481,20 @@ struct cpu_apply
        return {r.at<T>()};
    }

-    instruction_ref replace(instruction_ref ins, const operation& op)
+    instruction_ref replace(instruction_ref ins, const operation& op) const
    {
        return replace(ins, op, ins->inputs());
    }

    instruction_ref
-    replace(instruction_ref ins, const operation& op, std::vector<instruction_ref> inputs)
+    replace(instruction_ref ins, const operation& op, std::vector<instruction_ref> inputs) const
    {
        inputs.push_back(insert_allocation(ins, ins->get_shape()));
        return modl->replace_instruction(ins, op, inputs);
    }

-    instruction_ref insert_allocation(instruction_ref ins, const shape& s)
+    instruction_ref insert_allocation(instruction_ref ins, const shape& s) const
    {
-        auto ins_alias = instruction::get_output_alias(ins);
-        if(last->name() == "@return" and prog_output_names.count(ins_alias) > 0)
-        {
-            return modl->add_parameter(prog_output_names[ins_alias], s);
-        }
-        else if(ins == last)
-        {
-            return modl->add_parameter("output", s);
-        }
-
        return modl->insert_instruction(ins, make_op("cpu::allocate", {{"shape", to_value(s)}}));
    }
 };

--- a/src/targets/cpu/lrn.cpp
+++ b/src/targets/cpu/lrn.cpp
@@ -12,7 +12,7 @@ struct dnnl_lrn : dnnl_extend_op<dnnl_lrn, dnnl::lrn_forward, op::lrn>
    {
        return {dnnl::prop_kind::forward_inference,
                dnnl::algorithm::lrn_across_channels,
-                m.at(DNNL_ARG_SRC_0),
+                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)),
                this->op.size,
                this->op.alpha,
                this->op.beta,

--- a/src/targets/cpu/pooling.cpp
+++ b/src/targets/cpu/pooling.cpp
@@ -63,7 +63,7 @@ struct cpu_pooling : auto_register_op<cpu_pooling<Op>>
    shape compute_shape(std::vector<shape> inputs) const
    {
        inputs.pop_back();
-        return op.compute_shape(inputs);
+        return op.normalize_compute_shape(inputs);
    }

    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
@@ -125,19 +125,22 @@ template struct cpu_pooling<max_pool>;

 struct dnnl_pooling : dnnl_extend_op<dnnl_pooling, dnnl::pooling_forward, op::pooling>
 {
-    std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC}; }
+    std::vector<int> arg_map(int) const { return {MIGRAPHX_DNNL_PREFIX(ARG_SRC)}; }

    dnnl::pooling_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
-        auto algo = op.mode == "max" ? dnnl::algorithm::pooling_max : dnnl::algorithm::pooling_avg;
+        auto algo  = op.mode == "max" ? dnnl::algorithm::pooling_max : dnnl::algorithm::pooling_avg;
+        auto kdims = op.kdims();
+        std::vector<size_t> padding_l(op.padding.begin(), op.padding.begin() + kdims);
+        std::vector<size_t> padding_r(op.padding.begin() + kdims, op.padding.end());
        return {dnnl::prop_kind::forward_inference,
                algo,
-                m.at(DNNL_ARG_SRC),
-                m.at(DNNL_ARG_DST),
+                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
+                m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)),
                to_dnnl_dims(op.stride),
                to_dnnl_dims(op.lengths),
-                to_dnnl_dims(op.padding),
-                to_dnnl_dims(op.padding)};
+                to_dnnl_dims(padding_l),
+                to_dnnl_dims(padding_r)};
    }
 };


--- a/src/targets/cpu/preallocate.cpp
+++ b/src/targets/cpu/preallocate.cpp
+#include <migraphx/config.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/context.hpp>
+#include <migraphx/cpu/context.hpp>
+#include <migraphx/register_op.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace cpu {
+
+// Operator named "cpu::preallocate": it takes no computed work of its own and
+// simply returns a buffer of shape `s` that is created in finalize().
+struct cpu_preallocate : auto_register_op<cpu_preallocate>
+{
+    shape s;
+    std::string id = "";
+    // Buffer returned by compute(); assigned in finalize() and not reflected below.
+    argument data;
+
+    // Exposes `s` under the key "shape" and `id` under the key "id".
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.s, "shape"), f(self.id, "id"));
+    }
+
+    std::string name() const { return "cpu::preallocate"; }
+    // The output shape is always the stored `s`, independent of `inputs`.
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        // NOTE(review): has(0) presumably enforces that the op has no inputs - confirm.
+        check_shapes{inputs, *this}.has(0);
+        return s;
+    }
+    // Returns the argument stored by finalize(); all parameters are ignored.
+    argument compute(context&, const shape&, const std::vector<argument>&) const { return data; }
+    // Creates the buffer from the stored shape `s` (the shape parameter is ignored).
+    void finalize(context&, const shape&, const std::vector<shape>&) { data = argument(s); }
+    lifetime get_lifetime() const { return lifetime::global; }
+};
+
+} // namespace cpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/cpu/reduction.cpp
+++ b/src/targets/cpu/reduction.cpp
@@ -37,7 +37,11 @@ struct dnnl_reduction : dnnl_op<dnnl_reduction, dnnl::reduction>

    dnnl::reduction::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
-        return {to_dnnl_algo(algo), m.at(DNNL_ARG_SRC), m.at(DNNL_ARG_DST), 0, 0};
+        return {to_dnnl_algo(algo),
+                m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
+                m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)),
+                0,
+                0};
    }
 };


--- a/src/targets/cpu/reorder.cpp
+++ b/src/targets/cpu/reorder.cpp
@@ -27,7 +27,7 @@ struct dnnl_reorder : dnnl_op<dnnl_reorder, dnnl::reorder>
    };
    desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
-        return {m.at(DNNL_ARG_SRC), m.at(DNNL_ARG_DST)};
+        return {m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)), m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST))};
    }

    auto get_primitive_desc(const desc& d, const dnnl::primitive_attr& attr) const

--- a/src/targets/cpu/softmax.cpp
+++ b/src/targets/cpu/softmax.cpp
@@ -11,7 +11,7 @@ struct dnnl_softmax : dnnl_extend_op<dnnl_softmax, dnnl::softmax_forward, op::so
    dnnl::softmax_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        int axis = this->op.axis;
-        return {dnnl::prop_kind::forward_inference, m.at(DNNL_ARG_SRC_0), axis};
+        return {dnnl::prop_kind::forward_inference, m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)), axis};
    }
 };


--- a/src/targets/cpu/target.cpp
+++ b/src/targets/cpu/target.cpp
@@ -3,7 +3,6 @@
 #include <migraphx/check_context.hpp>
 #include <migraphx/adjust_allocation.hpp>
 #include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/decompose.hpp>
 #include <migraphx/eliminate_allocation.hpp>
 #include <migraphx/eliminate_common_subexpression.hpp>
 #include <migraphx/eliminate_concat.hpp>
@@ -14,14 +13,16 @@
 #include <migraphx/memory_coloring.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
-#include <migraphx/remap.hpp>
 #include <migraphx/rewrite_batchnorm.hpp>
 #include <migraphx/rewrite_pooling.hpp>
+#include <migraphx/rewrite_quantization.hpp>
 #include <migraphx/rewrite_rnn.hpp>
 #include <migraphx/schedule.hpp>
 #include <migraphx/memory_coloring.hpp>
 #include <migraphx/simplify_algebra.hpp>
+#include <migraphx/simplify_qdq.hpp>
 #include <migraphx/simplify_reshapes.hpp>
+#include <migraphx/preallocate_param.hpp>
 #include <migraphx/cpu/fuse_ops.hpp>
 #include <migraphx/cpu/write_literals.hpp>
 #include <migraphx/cpu/allocation_model.hpp>
@@ -45,9 +46,9 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
    std::set<shape::type_t> unsupported_types(shape::types().begin(), shape::types().end());
    unsupported_types.erase(shape::type_t::float_type);
    return {normalize_ops{},
-            eliminate_data_type{unsupported_types, shape::type_t::float_type},
+            rewrite_quantization{},
            dead_code_elimination{},
-            decompose{},
+            eliminate_data_type{unsupported_types, shape::type_t::float_type},
            dead_code_elimination{},
            simplify_reshapes{},
            eliminate_identity{},
@@ -76,6 +77,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
            write_literals{},
            dead_code_elimination{},
            memory_coloring{"cpu::allocate"},
+            dead_code_elimination{},
+            preallocate_param{"scratch", cpu_allocation_model{}},
            dead_code_elimination{}};
 }


--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -41,6 +41,7 @@ add_library(migraphx_device
    device/equal.cpp
    device/erf.cpp
    device/exp.cpp
+    device/fill.cpp
    device/floor.cpp
    device/gather.cpp
    device/gelu.cpp
@@ -58,9 +59,12 @@ add_library(migraphx_device
    device/mul.cpp
    device/mul_add.cpp
    device/mul_add_relu.cpp
+    device/multinomial.cpp
+    device/nonzero.cpp
    device/pad.cpp
    device/pow.cpp
    device/prelu.cpp
+    device/prefix_scan_sum.cpp
    device/recip.cpp
    device/reduce_max.cpp
    device/reduce_mean.cpp
@@ -68,9 +72,11 @@ add_library(migraphx_device
    device/reduce_sum.cpp
    device/reduce_prod.cpp
    device/relu.cpp
+    device/reverse.cpp
    device/rnn_variable_seq_lens.cpp
    device/round.cpp
    device/rsqrt.cpp
+    device/scatter.cpp
    device/sigmoid.cpp
    device/sign.cpp
    device/sin.cpp
@@ -81,7 +87,9 @@ add_library(migraphx_device
    device/sub.cpp
    device/tan.cpp
    device/tanh.cpp
+    device/topk.cpp
    device/unary_not.cpp
+    device/where.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION})
@@ -116,10 +124,12 @@ add_library(migraphx_gpu
    code_object_op.cpp
    compile_hip.cpp
    compile_hip_code_object.cpp
+    compile_pointwise.cpp
    concat.cpp
    convert.cpp
    convolution.cpp
    deconvolution.cpp
+    device_name.cpp
    eliminate_workspace.cpp
    elu.cpp
    fuse_ops.cpp
@@ -131,21 +141,26 @@ add_library(migraphx_gpu
    kernel.cpp
    lowering.cpp
    logsoftmax.cpp
+    loop.cpp
    lrn.cpp
    leaky_relu.cpp
    mlir_conv.cpp
+    multinomial.cpp
+    nonzero.cpp
    pack_args.cpp
    pack_int8_args.cpp
    pad.cpp
    pooling.cpp
-    preallocate_param.cpp
    quant_convolution.cpp
+    reverse.cpp
    rnn_variable_seq_lens.cpp
    rocblas.cpp
-    softmax.cpp
+    scatter.cpp
    schedule_model.cpp
+    softmax.cpp
    sync_device.cpp
    target.cpp
+    topk.cpp
    write_literals.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
@@ -184,12 +199,16 @@ register_migraphx_gpu_ops(hip_
    logical_and
    logical_or
    logical_xor
+    loop
    max
    min
    mul
+    multinomial
+    nonzero
    pad
    pow
    prelu
+    prefix_scan_sum
    recip
    reduce_max
    reduce_mean
@@ -197,8 +216,10 @@ register_migraphx_gpu_ops(hip_
    reduce_prod
    reduce_sum
    relu
+    reverse
    round
    rsqrt
+    scatter
    sigmoid
    sign
    sinh
@@ -209,7 +230,9 @@ register_migraphx_gpu_ops(hip_
    sub
    tanh
    tan
+    topk
    unary_not
+    where
 )
 register_migraphx_gpu_ops(miopen_
    abs
@@ -275,19 +298,27 @@ if(MIGRAPHX_ENABLE_MLIR)
    target_link_libraries(migraphx_gpu PUBLIC ${LIBMLIRMIOPEN})
 endif()

+set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "")
+if(MIGRAPHX_USE_HIPRTC)
+target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_USE_HIPRTC=1)
+else()
 # Get flags needed to compile hip
 include(TargetFlags)
 target_flags(HIP_COMPILER_FLAGS hip::device)
 # Remove cuda arch flags
-string(REGEX REPLACE "--cuda-gpu-arch=[^ \t\r\n]+" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
-string(REGEX REPLACE "--offload-arch=[^ \t\r\n]+" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
+string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
+string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
+string(REPLACE "$<LINK_LANGUAGE:CXX>" "1" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
+string(REPLACE "SHELL:" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
 message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
 target_compile_definitions(migraphx_gpu PRIVATE 
    "-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}" 
    "-DMIGRAPHX_HIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}" 
    "-DMIGRAPHX_OFFLOADBUNDLER_BIN=${MIGRAPHX_OFFLOADBUNDLER_BIN}"
    "-DMIGRAPHX_EXTRACT_KERNEL=${MIGRAPHX_EXTRACT_KERNEL}"
+    "-DMIGRAPHX_USE_HIPRTC=0"
 )
+endif()

 # Check miopen find mode api
 include(CheckLibraryExists)
@@ -305,6 +336,8 @@ target_compile_definitions(migraphx_gpu PUBLIC -D__HIP_PLATFORM_HCC__=1)
 target_link_libraries(migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas)
 target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)

+add_subdirectory(driver)
+
 rocm_install_targets(
  TARGETS migraphx_gpu migraphx_device
  INCLUDE

--- a/src/targets/gpu/allocation_model.cpp
+++ b/src/targets/gpu/allocation_model.cpp
@@ -11,6 +11,11 @@ operation gpu_allocation_model::allocate(const shape& s) const
    return make_op(name(), {{"shape", to_value(s)}});
 }

+// Builds a "hip::hip_allocate_memory" operation whose "shape" field is `s`
+// (converted with to_value) and whose "id" field is `id`.
+operation gpu_allocation_model::preallocate(const shape& s, const std::string& id) const
+{
+    return make_op("hip::hip_allocate_memory", {{"shape", to_value(s)}, {"id", id}});
+}
+
 std::string gpu_allocation_model::copy() const { return "hip::copy"; }

 } // namespace gpu

--- a/src/targets/gpu/code_object_op.cpp
+++ b/src/targets/gpu/code_object_op.cpp
--- a/src/targets/gpu/compile_hip.cpp
+++ b/src/targets/gpu/compile_hip.cpp
 #include <migraphx/gpu/compile_hip.hpp>
 #include <migraphx/errors.hpp>
 #include <migraphx/stringutils.hpp>
+#include <migraphx/env.hpp>
+#include <cassert>
+#include <iostream>
+
+#if MIGRAPHX_USE_HIPRTC
+#include <hip/hiprtc.h>
+#include <migraphx/manage_ptr.hpp>
+#include <migraphx/env.hpp>
+#else
 #include <migraphx/compile_src.hpp>
 #include <migraphx/process.hpp>
-#include <cassert>
+#endif

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
+
+#if MIGRAPHX_USE_HIPRTC
+
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC)
+
+// Formats a hiprtc failure as "hiprtc: <error string>: <msg>".
+std::string hiprtc_error(hiprtcResult err, const std::string& msg)
+{
+    std::string text = "hiprtc: ";
+    text += hiprtcGetErrorString(err);
+    text += ": ";
+    text += msg;
+    return text;
+}
+
+// Throws an exception built from `ctx` and the formatted hiprtc error unless
+// `err` is HIPRTC_SUCCESS.
+void hiprtc_check_error(hiprtcResult err, const std::string& msg, const std::string& ctx)
+{
+    if(err == HIPRTC_SUCCESS)
+        return;
+    throw make_exception(ctx, hiprtc_error(err, msg));
+}
+
+// Evaluates a hiprtc call and passes its result to hiprtc_check_error, using
+// the stringified call expression as the message and the current source
+// context (MIGRAPHX_MAKE_SOURCE_CTX) as the location.
+#define MIGRAPHX_HIPRTC(...) \
+    hiprtc_check_error(__VA_ARGS__, #__VA_ARGS__, MIGRAPHX_MAKE_SOURCE_CTX())
+
+// Throws via MIGRAPHX_THROW with the message produced by hiprtc_error(error, msg).
+#define MIGRAPHX_HIPRTC_THROW(error, msg) MIGRAPHX_THROW(hiprtc_error(error, msg))
+
+// Workaround hiprtc's broken API: hiprtcDestroyProgram is called with the
+// address of the handle, so this adapter accepts the handle by value and
+// forwards a pointer to its local copy. The call's return value is ignored.
+void hiprtc_program_destroy(hiprtcProgram prog) { hiprtcDestroyProgram(&prog); }
+// Owning handle type for a hiprtcProgram that uses hiprtc_program_destroy as its deleter.
+using hiprtc_program_ptr = MIGRAPHX_MANAGE_PTR(hiprtcProgram, hiprtc_program_destroy);
+
+// Calls hiprtcCreateProgram with the forwarded arguments and returns the new
+// program wrapped in an owning pointer; throws if creation does not succeed.
+template <class... Ts>
+hiprtc_program_ptr hiprtc_program_create(Ts... xs)
+{
+    hiprtcProgram raw_prog = nullptr;
+    const auto status      = hiprtcCreateProgram(&raw_prog, xs...);
+    // Wrap the handle before inspecting the status so it is owned on the throw path too.
+    hiprtc_program_ptr owner{raw_prog};
+    if(status != HIPRTC_SUCCESS)
+        MIGRAPHX_HIPRTC_THROW(status, "Create program failed.");
+    return owner;
+}
+
+// Wraps a hiprtc program built from a set of in-memory source files: the
+// single ".cpp" entry becomes the translation unit and every other entry is
+// registered as a header under its path.
+struct hiprtc_program
+{
+    // Owns a list of strings and exposes them as the `const char**` array that
+    // the hiprtc C API expects.
+    struct string_array
+    {
+        std::vector<std::string> strings{};
+        std::vector<const char*> c_strs{};
+
+        string_array() {}
+        string_array(const string_array&) = delete;
+
+        std::size_t size() const { return strings.size(); }
+
+        // Rebuilds the pointer table from the current strings. The pointers
+        // are collected here rather than in push_back because growing
+        // `strings` may reallocate and move the std::string objects, which
+        // invalidates previously captured c_str() pointers (notably for
+        // strings stored in the small-string buffer). The result stays valid
+        // until `strings` is modified again.
+        const char** data()
+        {
+            c_strs.clear();
+            c_strs.reserve(strings.size());
+            for(const auto& s : strings)
+                c_strs.push_back(s.c_str());
+            return c_strs.data();
+        }
+
+        void push_back(std::string s) { strings.push_back(std::move(s)); }
+    };
+
+    hiprtc_program_ptr prog = nullptr;
+    string_array headers{};
+    string_array include_names{};
+    std::string cpp_src  = "";
+    std::string cpp_name = "";
+
+    // Splits `srcs` into the main source (extension ".cpp"; the last one wins
+    // if several are present) and headers, then creates the hiprtc program.
+    hiprtc_program(const std::vector<src_file>& srcs)
+    {
+        for(auto&& src : srcs)
+        {
+            std::string content{src.content.first, src.content.second};
+            std::string path = src.path.string();
+            if(src.path.extension().string() == ".cpp")
+            {
+                cpp_src  = std::move(content);
+                cpp_name = std::move(path);
+            }
+            else
+            {
+                headers.push_back(std::move(content));
+                include_names.push_back(std::move(path));
+            }
+        }
+        // All strings have been added at this point, so the pointer tables
+        // produced by data() are stable for the duration of the call.
+        prog = hiprtc_program_create(cpp_src.c_str(),
+                                     cpp_name.c_str(),
+                                     headers.size(),
+                                     headers.data(),
+                                     include_names.data());
+    }
+
+    // Compiles the program with the given options. Any non-empty compiler log
+    // is written to stderr; a failed compilation throws.
+    void compile(const std::vector<std::string>& options)
+    {
+        if(enabled(MIGRAPHX_TRACE_HIPRTC{}))
+            std::cout << "hiprtc " << join_strings(options, " ") << " " << cpp_name << std::endl;
+        std::vector<const char*> c_options;
+        std::transform(options.begin(),
+                       options.end(),
+                       std::back_inserter(c_options),
+                       [](const std::string& s) { return s.c_str(); });
+        auto result = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
+        // Only report the log when there is something in it, to avoid
+        // emitting a blank line on stderr for every clean compilation.
+        const auto build_log = log();
+        if(not build_log.empty())
+            std::cerr << build_log << std::endl;
+        if(result != HIPRTC_SUCCESS)
+            MIGRAPHX_HIPRTC_THROW(result, "Compilation failed.");
+    }
+
+    // Returns the program log without its trailing null terminator, or an
+    // empty string when the reported log size is below 2.
+    std::string log()
+    {
+        std::size_t n = 0;
+        MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n));
+        if(n < 2)
+            return {};
+        std::vector<char> buffer(n);
+        MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
+        assert(buffer.back() == 0);
+        return {buffer.begin(), buffer.end() - 1};
+    }
+
+    // Returns the compiled code as a byte buffer of the size reported by hiprtcGetCodeSize.
+    std::vector<char> get_code_obj()
+    {
+        std::size_t n = 0;
+        MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n));
+        std::vector<char> buffer(n);
+        MIGRAPHX_HIPRTC(hiprtcGetCode(prog.get(), buffer.data()));
+        return buffer;
+    }
+};
+
+// Compiles `srcs` with hiprtc for the architecture `arch` and returns a single
+// code object. `params` is split on spaces into individual options; the
+// standard, optimisation level, and architecture flags are appended here.
+std::vector<std::vector<char>>
+compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch)
+{
+    hiprtc_program prog(srcs);
+    auto options = split_string(params, ' ');
+    if(enabled(MIGRAPHX_GPU_DEBUG{}))
+        options.push_back("-DMIGRAPHX_DEBUG");
+    // Default to C++17 unless the caller already selected a language standard.
+    if(std::none_of(options.begin(), options.end(), [](const std::string& s) {
+           return starts_with(s, "--std=") or starts_with(s, "-std=");
+       }))
+        options.push_back("-std=c++17");
+    options.push_back("-fno-gpu-rdc");
+    // Each element is handed to the compiler as its own argument, so the flag
+    // must not carry a leading space (unlike the string-concatenated command
+    // line used when hiprtc is disabled).
+    options.push_back("-O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
+    options.push_back("-Wno-cuda-compat");
+    options.push_back("--cuda-gpu-arch=" + arch);
+    prog.compile(options);
+    return {prog.get_code_obj()};
+}
+
+#else // MIGRAPHX_USE_HIPRTC
+
 bool is_hcc_compiler()
 {
    static const auto result = ends_with(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER), "hcc");
@@ -41,9 +197,12 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
    {
        params += " --cuda-gpu-arch=" + arch;
        params += " --cuda-device-only";
-        params += " -O3 ";
+        params += " -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3") + " ";
    }

+    if(enabled(MIGRAPHX_GPU_DEBUG{}))
+        params += " -DMIGRAPHX_DEBUG";
+
    params += " -Wno-unused-command-line-argument -Wno-cuda-compat ";
    params += MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_FLAGS);

@@ -71,6 +230,8 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
    return {compiler.compile(srcs)};
 }

+#endif // MIGRAPHX_USE_HIPRTC
+
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -2,9 +2,9 @@
 #include <migraphx/gpu/compile_hip.hpp>
 #include <migraphx/gpu/code_object_op.hpp>
 #include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/device_name.hpp>
 #include <migraphx/context.hpp>
 #include <migraphx_kernels.hpp>
-#include <migraphx/rank.hpp>
 #include <migraphx/stringutils.hpp>
 #include <hip/hip_runtime_api.h>

@@ -12,36 +12,6 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-template <class HipDeviceProp>
-std::string get_arch_name(rank<0>, const HipDeviceProp& props)
-{
-    return "gfx" + std::to_string(props.gcnArch);
-}
-
-template <class HipDeviceProp>
-auto get_arch_name(rank<1>, const HipDeviceProp& props) -> decltype(std::string(props.gcnArchName))
-{
-    return std::string(props.gcnArchName);
-}
-
-int get_device_id()
-{
-    int device;
-    auto status = hipGetDevice(&device);
-    if(status != hipSuccess)
-        MIGRAPHX_THROW("No device");
-    return device;
-}
-
-std::string get_device_name()
-{
-    hipDeviceProp_t props{};
-    auto status = hipGetDeviceProperties(&props, get_device_id());
-    if(status != hipSuccess)
-        MIGRAPHX_THROW("Failed to get device properties");
-    return get_arch_name(rank<1>{}, props);
-}
-
 template <class T>
 std::string generate_index_ints(const std::vector<T>& v)
 {
@@ -98,6 +68,31 @@ __content__
    return replace_string(args_hpp, "__content__", inner);
 }

+// Returns the warning flags used for generated kernels: everything enabled,
+// followed by the list of warnings that are explicitly switched off.
+const std::vector<std::string>& compiler_warnings()
+{
+    static const std::vector<std::string> flags{
+        "-Weverything",
+        "-Wno-c++98-compat",
+        "-Wno-c++98-compat-pedantic",
+        "-Wno-conversion",
+        "-Wno-double-promotion",
+        "-Wno-exit-time-destructors",
+        "-Wno-extra-semi",
+        "-Wno-extra-semi-stmt",
+        "-Wno-float-conversion",
+        "-Wno-gnu-anonymous-struct",
+        "-Wno-gnu-zero-variadic-macro-arguments",
+        "-Wno-missing-prototypes",
+        "-Wno-nested-anon-types",
+        "-Wno-padded",
+        "-Wno-shorten-64-to-32",
+        "-Wno-sign-conversion",
+        "-Wno-sign-compare",
+        "-Wno-unused-command-line-argument",
+        "-Wno-weak-vtables",
+        "-Wno-c99-extensions",
+    };
+    return flags;
+}
+
 operation compile_hip_code_object(const std::string& content, hip_compile_options options)
 {
    std::vector<src_file> srcs;
@@ -112,10 +107,14 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
                   });
    srcs.push_back(src_file{fs::path{"main.cpp"},
                            std::make_pair(content.data(), content.data() + content.size())});
-    auto args_hpp = generate_args_hpp(options.inputs);
+    auto args_hpp =
+        generate_args_hpp(options.reduced_inputs.empty() ? options.inputs : options.reduced_inputs);
    srcs.push_back(src_file{fs::path{"args.hpp"},
                            std::make_pair(args_hpp.data(), args_hpp.data() + args_hpp.size())});
-    options.params += " -I.";
+    options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
+    options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
+    options.params += " " + join_strings(compiler_warnings(), " ");
+    options.params += " -Werror";
    auto cos = compile_hip_src(srcs, std::move(options.params), get_device_name());
    if(cos.size() != 1)
        MIGRAPHX_THROW("No code object");

--- a/src/targets/gpu/compile_pointwise.cpp
+++ b/src/targets/gpu/compile_pointwise.cpp
+#include <migraphx/gpu/compile_pointwise.hpp>
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/reduce_dims.hpp>
+#include <migraphx/stringutils.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+static const char* const pointwise_kernel = R"__migraphx__(
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/pointwise.hpp>
+#include <args.hpp>
+
+using namespace migraphx;
+
+extern "C" {
+__global__ void kernel(${params}) 
+{
+    pointwise(${lambda}, ${args});
+}
+    
+}
+
+int main() {}
+
+)__migraphx__";
+
+// Produces "<param>0,<param>1,...,<param>(count-1)": `param` suffixed with each
+// index in [0, count), joined with commas.
+std::string enum_params(std::size_t count, std::string param)
+{
+    std::vector<std::string> names;
+    names.reserve(count);
+    for(std::size_t idx = 0; idx < count; ++idx)
+        names.push_back(param + std::to_string(idx));
+    return join_strings(names, ",");
+}
+
+// Returns the global work size for `n` elements: the number of `local`-sized
+// groups needed to cover `n` (rounded up), capped at 256 groups, times `local`.
+std::size_t compute_global(std::size_t n, std::size_t local = 1024)
+{
+    constexpr std::size_t max_groups = 256;
+    const std::size_t needed         = (n + local - 1) / local;
+    const std::size_t used           = needed < max_groups ? needed : max_groups;
+    return used * local;
+}
+
+// Generates the pointwise kernel source for `lambda` with one `void*`
+// parameter per entry of `inputs`, and compiles it into a code-object
+// operation. The last input shape is used as the output shape.
+operation compile_pointwise(context&, const std::vector<shape>& inputs, const std::string& lambda)
+{
+    const auto nargs = inputs.size();
+    auto src         = interpolate_string(pointwise_kernel,
+                                  {{"params", enum_params(nargs, "void * private_p")},
+                                   {"args", enum_params(nargs, "private_p")},
+                                   {"lambda", lambda}});
+
+    hip_compile_options options;
+    options.local          = 1024;
+    options.global         = compute_global(inputs.front().elements());
+    options.inputs         = inputs;
+    options.output         = inputs.back();
+    options.reduced_inputs = reduce_dims(inputs);
+    return compile_hip_code_object(src, options);
+}
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/convolution.cpp
+++ b/src/targets/gpu/convolution.cpp
@@ -11,7 +11,7 @@ shape miopen_convolution::compute_shape(const std::vector<shape>& inputs) const
    check_shapes{inputs, *this}.has(4).standard();
    std::vector<shape> conv_inputs(inputs.begin(), inputs.begin() + 2);
    check_shapes{conv_inputs, *this}.max_ndims(5);
-    return op.compute_shape(conv_inputs);
+    return op.normalize_compute_shape(conv_inputs);
 }

 inline shape reshape_if_1d(const shape& input)

--- a/src/targets/gpu/device/fill.cpp
+++ b/src/targets/gpu/device/fill.cpp
+#include <migraphx/gpu/device/fill.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+// Invokes nary on `stream` and `result` with a zero-argument device lambda
+// that returns `val` (captured by value).
+// NOTE(review): presumably this writes `val` to every element of `result` - confirm against nary.
+void fill(hipStream_t stream, const argument& result, unsigned long val)
+{
+    nary(stream, result)([=]() __device__ { return val; });
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/include/migraphx/gpu/device/float_equal.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/float_equal.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
+#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
+
+#include <migraphx/requires.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/device/types.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+template <class... Ts>
+using common_type = typename std::common_type<Ts...>::type;
+
+// Floating-point overload: true only when both values are finite and `y` lies
+// between the representable neighbours of `x` on either side (inclusive).
+// A non-finite `x` or `y` therefore never compares equal.
+template <class T, MIGRAPHX_REQUIRES(is_floating_point<T>{})>
+__device__ bool float_equal_device(T x, T y)
+{
+    return std::isfinite(x) and std::isfinite(y) and
+           std::nextafter(x, std::numeric_limits<T>::lowest()) <= y and
+           std::nextafter(x, std::numeric_limits<T>::max()) >= y;
+}
+
+// Non-floating-point overload: plain equality comparison.
+template <class T, MIGRAPHX_REQUIRES(not is_floating_point<T>{})>
+__device__ bool float_equal_device(T x, T y)
+{
+    return x == y;
+}
+
+// Compares two values of possibly different types by converting both to
+// their common type and dispatching to the matching float_equal_device overload.
+template <class T, class U>
+__device__ bool float_equal(T x, U y)
+{
+    return float_equal_device<common_type<T, U>>(x, y);
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp
@@ -352,7 +352,8 @@ bool broadcastable(bool& divisible_by_4,
        auto b_len          = result.get_shape().lens()[b_idx];
        auto b_stride       = result.get_shape().strides()[b_idx];
        assert(bshape.lens()[b_idx] == b_len);
-        if(b_len <= max_size and std::none_of(std::next(b_it), strides.end(), not_zero))
+        if(b_len <= max_size and std::none_of(std::next(b_it), strides.end(), not_zero) and
+           is_divisor_encodable(b_stride * b_len))
        {

            divisible_by_4 = (b_len % 4 == 0) and (b_stride % 4 == 0) and