Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into develop

26d1a969 · Alan Turner · f5ebc8f5 · 065d06af · 26d1a969 · 26d1a969
Commit 26d1a969 authored Aug 10, 2023 by Alan Turner
20 changed files
--- a/src/rewrite_quantization.cpp
+++ b/src/rewrite_quantization.cpp
@@ -28,6 +28,7 @@
 #include <migraphx/tune_axis.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/shape.hpp>
+#include <migraphx/common.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -61,13 +62,10 @@ void apply_quantizelinear(module& m, instruction_ref ins)
        max_quant = qt.max();
        min_quant = qt.min();
    });
-    auto s = add_zero_point->get_shape();
+    auto s        = add_zero_point->get_shape();
-    std::vector<int> min_data(s.elements(), min_quant);
+    auto min_arg  = m.add_literal(literal{shape{s.type()}, {min_quant}});
-    std::vector<int> max_data(s.elements(), max_quant);
+    auto max_arg  = m.add_literal(literal{shape{s.type()}, {max_quant}});
-    auto min_arg = m.add_literal(literal(s, min_data));
+    auto saturate = insert_common_op(m, ins, make_op("clip"), {add_zero_point, min_arg, max_arg});
-    auto max_arg = m.add_literal(literal(s, max_data));
-    auto saturate = m.insert_instruction(ins, make_op("clip"), add_zero_point, min_arg, max_arg);
    m.replace_instruction(
        ins, make_op("convert", {{"target_type", ins->get_shape().type()}}), saturate);
 }

--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
@@ -1095,8 +1095,9 @@ MIGRAPHX_PRED_MATCHER(horiz_conv_dot, instruction_ref ins)
        };
    };
    auto dots  = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("dot"));
+    auto qdots = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("quant_dot"));
    auto convs = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("convolution"));
-    return (dots >= 2 or convs >= 2);
+    return (dots >= 2 or convs >= 2 or qdots >= 2);
 }
 struct find_conv_dot_horiz_fusion
@@ -1110,7 +1111,7 @@ struct find_conv_dot_horiz_fusion
        auto pred = [](auto i, auto j) {
            if(i->get_operator() != j->get_operator())
                return false;
-            if(not contains({"dot", "convolution"}, i->name()))
+            if(not contains({"quant_dot", "dot", "convolution"}, i->name()))
                return true;
            auto x = i->inputs()[1]->get_shape().lens();
            auto y = j->inputs()[1]->get_shape().lens();
@@ -1118,7 +1119,7 @@ struct find_conv_dot_horiz_fusion
                return false;
            // Check that non-axes match
            int axis = 1;
-            if(i->name() == "dot")
+            if(i->name() == "dot" or i->name() == "quant_dot")
            {
                axis = x.size() - 1;
            }
@@ -1129,7 +1130,7 @@ struct find_conv_dot_horiz_fusion
            if(std::distance(start, last) < 2)
                return;
            auto&& name = (*start)->name();
-            if(not contains({"dot", "convolution"}, name))
+            if(not contains({"quant_dot", "dot", "convolution"}, name))
                return;
            auto op   = (*start)->get_operator();
            int group = 1;
@@ -1144,7 +1145,7 @@ struct find_conv_dot_horiz_fusion
                start, last, std::back_inserter(args), [&](auto x) { return x->inputs().at(1); });
            int axis        = 1;
            int concat_axis = 0;
-            if(name == "dot")
+            if(name == "dot" or name == "quant_dot")
            {
                axis        = int(args.front()->get_shape().lens().size() - 1);
                concat_axis = axis;

--- a/src/sqlite.cpp
+++ b/src/sqlite.cpp
@@ -48,6 +48,7 @@ struct sqlite_impl
    template <class F>
    void exec(const char* sql, F f)
    {
+        // cppcheck-suppress constParameterPointer
        auto callback = [](void* obj, auto... xs) -> int {
            try
            {

--- a/src/targets/cpu/gemm.cpp
+++ b/src/targets/cpu/gemm.cpp
@@ -43,7 +43,11 @@ struct dnnl_gemm : dnnl_extend_op<dnnl_gemm, dnnl::matmul, op::dot>
                MIGRAPHX_DNNL_PREFIX(ARG_BIAS)};
    }
-    void required(const check_shapes& cs) const { cs.not_broadcasted(); }
+    template <class T> // accepts any check_shapes<T> specialization
+    void required(const check_shapes<T>& cs) const
+    {
+        cs.not_broadcasted(); // NOTE(review): presumably rejects broadcasted input shapes - confirm in check_shapes
+    }
    dnnl::matmul::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {

--- a/src/targets/cpu/include/migraphx/cpu/dnnl.hpp
+++ b/src/targets/cpu/include/migraphx/cpu/dnnl.hpp
@@ -400,7 +400,11 @@ struct dnnl_extend_op : dnnl_op<Derived, Primitive>
    }
    // dnnl has some issues with non-packed inputs
-    void required(const check_shapes& cs) const { cs.packed_or_broadcasted(); }
+    template <class T> // accepts any check_shapes<T> specialization
+    void required(const check_shapes<T>& cs) const
+    {
+        cs.packed_or_broadcasted(); // NOTE(review): presumably requires packed or broadcasted shapes - confirm in check_shapes
+    }
    std::string name() const { return "dnnl::" + op.name(); }
    shape compute_shape(std::vector<shape> inputs) const

--- a/src/targets/cpu/target.cpp
+++ b/src/targets/cpu/target.cpp
@@ -61,7 +61,7 @@ namespace cpu {
 std::string target::name() const { return "cpu"; }
-// cppcheck-suppress constParameter
+// cppcheck-suppress constParameterReference
 std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options&) const
 {
    auto& ctx = any_cast<context>(gctx);

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -48,7 +48,7 @@ include(Embed)
 file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
 message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
-add_embed_library(migraphx_kernels ${KERNEL_FILES})
+add_embed_library(migraphx_kernels ${KERNEL_FILES} RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/)
 file(GLOB DEVICE_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp)
 add_library(migraphx_device ${DEVICE_GPU_SRCS})

--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
@@ -331,7 +331,7 @@ static std::vector<std::string> get_op_names(const module& m)
    {
        if(starts_with(ins.name(), "@"))
            continue;
-        if(ins.name() == "multibroadcast")
+        if(contains({"multibroadcast", "contiguous"}, ins.name()))
            continue;
        if(ins.name() == "pointwise")
        {

--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -167,7 +167,7 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
                   [](auto&& p) {
                       auto&& name = p.first;
                       auto&& c    = p.second;
-                       auto path   = fs::path{"migraphx"} / "kernels" / name;
+                       auto path   = name;
                       return src_file{path, c};
                   });
    srcs.push_back(src_file{fs::path{"main.cpp"},

--- a/src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp
@@ -124,7 +124,7 @@ void nary_broadcast_vec_impl(
                    buffer[i] = binput.data()[i];
                }
                __syncthreads();
-                auto* bp = as_pointer(buffer);
+                const auto* bp = as_pointer(buffer);
                // Process the data
                for(size_t i = idx.global; i < nelements; i += nglobal)
                {
@@ -219,7 +219,7 @@ void nary_double_broadcast_vec_impl(
                    buffer[i + bdim_vec_len] = binput2.data()[i];
                }
                __syncthreads();
-                auto* bp = as_pointer(buffer);
+                const auto* bp = as_pointer(buffer);
                // Process the data
                for(size_t i = idx.global; i < nelements; i += nglobal)
                {

--- a/src/targets/gpu/device/topk.cpp
+++ b/src/targets/gpu/device/topk.cpp
@@ -72,12 +72,12 @@ struct hip_heap_vector
            index_int l    = 2 * index + 1;
            index_int r    = 2 * index + 2;
-            if(l < n && compare(data[data_index(l)], data[data_index(index)]))
+            if(l < n and compare(data[data_index(l)], data[data_index(index)]))
            {
                index = l;
            }
-            if(r < n && compare(data[data_index(r)], data[data_index(index)]))
+            if(r < n and compare(data[data_index(r)], data[data_index(index)]))
            {
                index = r;
                if(compare(data[data_index(l)], data[data_index(r)]))

--- a/src/targets/gpu/device_name.cpp
+++ b/src/targets/gpu/device_name.cpp
@@ -31,20 +31,6 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-template <class HipDeviceProp>
-std::string get_arch_name(rank<0>, const HipDeviceProp& props)
-{
-    return "gfx" + std::to_string(props.gcnArch);
-}
-template <class HipDeviceProp>
-auto get_arch_name(rank<1>, const HipDeviceProp& props) -> decltype(std::string(props.gcnArchName))
-{
-    return std::string(props.gcnArchName);
-}
-std::string get_arch_name(const hipDeviceProp_t& props) { return get_arch_name(rank<1>{}, props); }
 int get_device_id()
 {
    int device;
@@ -60,7 +46,7 @@ std::string get_device_name()
    auto status = hipGetDeviceProperties(&props, get_device_id());
    if(status != hipSuccess)
        MIGRAPHX_THROW("Failed to get device properties");
-    return get_arch_name(props);
+    return props.gcnArchName;
 }
 } // namespace gpu

--- a/src/targets/gpu/fuse_mlir.cpp
+++ b/src/targets/gpu/fuse_mlir.cpp
@@ -239,13 +239,13 @@ struct find_mlir_op
        bool is_float = contains({type_t::float_type, type_t::half_type}, result_type);
        if(contains(any_type_ops, name))
            return true;
-        if(result_type != type_t::bool_type && contains(no_bool_ops, name))
+        if(result_type != type_t::bool_type and contains(no_bool_ops, name))
            return true;
-        if(is_float && contains(fp_only_ops, name))
+        if(is_float and contains(fp_only_ops, name))
            return true;
        // Only conversions between floating types are known to be unambiguously
        // supported.
-        if(is_float && name == "convert")
+        if(is_float and name == "convert")
        {
            return std::all_of(i.inputs().begin(), i.inputs().end(), [](const auto& arg) {
                return contains({type_t::float_type, type_t::half_type}, arg->get_shape().type());

--- a/src/targets/gpu/include/migraphx/gpu/compiler.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compiler.hpp
@@ -32,6 +32,7 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/optional.hpp>
 #include <migraphx/rank.hpp>
+#include <migraphx/gpu/tuning_config.hpp>
 #include <functional>
 namespace migraphx {
@@ -68,12 +69,6 @@ struct compiler_replace
    }
 };
-struct tuning_config
-{
-    value problem;
-    std::vector<value> solutions;
-};
 using compiler_compile =
    std::function<compiler_replace(context&, instruction_ref, operation, const value&)>;
 using compiler_compile_op =

--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -46,13 +46,7 @@ using hip_event_ptr = MIGRAPHX_MANAGE_PTR(hipEvent_t, hipEventDestroy);
 struct hip_device
 {
-    hip_device()
+    hip_device() : device_props{} { add_stream(); } // value-initializes device_props, then calls add_stream()
-    {
-        device_props.gcnArchName[0]      = '\0';
-        device_props.gcnArch             = 0;
-        device_props.multiProcessorCount = 0;
-        add_stream();
-    }
    hip_device(std::size_t id, std::size_t n) : device_id(id)
    {
@@ -171,7 +165,7 @@ struct hip_device
    std::size_t stream_id() const { return current_stream; }
-    std::string get_device_name() const { return get_arch_name(device_props); }
+    std::string get_device_name() const { return device_props.gcnArchName; } // gcnArchName converted to std::string on return
    std::string get_gfx_name() const { return trim(split_string(get_device_name(), ':').front()); }

--- a/src/targets/gpu/include/migraphx/gpu/device_name.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device_name.hpp
@@ -33,8 +33,6 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-MIGRAPHX_GPU_EXPORT std::string get_arch_name(const hipDeviceProp_t& props);
 MIGRAPHX_GPU_EXPORT std::string get_device_name();
 MIGRAPHX_GPU_EXPORT int get_device_id();

--- a/src/targets/gpu/include/migraphx/gpu/hip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/hip.hpp
@@ -92,7 +92,7 @@ struct hip_sync_stream
        return inputs.front();
    }
-    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
+    argument compute(const context& ctx, const shape&, const std::vector<argument>& args) const
    {
        gpu_sync(ctx);
        if(args.empty())

--- a/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir.hpp
@@ -29,6 +29,7 @@
 #include <migraphx/gpu/config.hpp>
 #include <migraphx/gpu/code_object_op.hpp>
 #include <migraphx/instruction_ref.hpp>
+#include <migraphx/gpu/tuning_config.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -36,16 +37,20 @@ struct module;
 namespace gpu {
 MIGRAPHX_GPU_EXPORT std::string dump_mlir(const module& m);
+MIGRAPHX_GPU_EXPORT code_object_op compile_mlir(const context& migraphx_ctx,
-MIGRAPHX_GPU_EXPORT code_object_op compile_mlir(const context& ctx,
                                                module m,
-                                                const std::vector<instruction_ref>& inputs);
+                                                const std::vector<instruction_ref>& inputs,
+                                                const value& solution);
 MIGRAPHX_GPU_EXPORT instruction_ref insert_mlir(module& m,
                                                instruction_ref ins,
                                                code_object_op co,
                                                const std::vector<instruction_ref>& inputs);
+MIGRAPHX_GPU_EXPORT tuning_config get_tuning_config_mlir(const context& migraphx_ctx,
+                                                         module m,
+                                                         const std::vector<shape>& inputs);
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/tuning_config.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/tuning_config.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP
+#define MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP
+#include <migraphx/config.hpp>
+#include <migraphx/value.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct tuning_config // bundles one problem value with a list of solution values
+{
+    value problem; // NOTE(review): presumably identifies what is being tuned - confirm with producers
+    std::vector<value> solutions; // NOTE(review): presumably the candidate configurations to try - confirm with producers
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_TUNING_CONFIG_HPP
--- a/src/targets/gpu/jit/ck_gemm.cpp
+++ b/src/targets/gpu/jit/ck_gemm.cpp
@@ -300,7 +300,8 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>
        const auto& b_shape = inputs[1];
        const auto& c_shape = inputs.back();
-        auto rank = a_shape.lens().size();
+        // cppcheck-suppress unreadVariable
+        auto rank = a_shape.ndim();
        auto batch_count = get_batch_count(c_shape);
        auto m           = c_shape.lens()[rank - 2];