Merge branch 'develop' into sqlite3_windows

514bd678 · Artur Wojcik · GitHub · bc7adab1 · f8bf7bd3 · 514bd678
Unverified Commit 514bd678 authored Oct 14, 2023 by Artur Wojcik Committed by GitHub Oct 14, 2023
20 changed files
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,5 +28,5 @@ ROCmSoftwarePlatform/half@rocm-5.6.0
 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/composable_kernel@a22e479b8e1557961039db2d5c5ff89cff35e86b -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/rocMLIR@12748a3402c069f733ea7f2ba1f8d8a070b3622a -DBUILD_FAT_LIBROCKCOMPILER=On
+ROCmSoftwarePlatform/composable_kernel@70eefcf4f263aa5c25f3c9ff0db8f6f199ef0fb9 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
+ROCmSoftwarePlatform/rocMLIR@507bb94ce7873786486d296ec81d2eadaab49003 -DBUILD_FAT_LIBROCKCOMPILER=On
\ No newline at end of file
--- a/src/onnx/padding.cpp
+++ b/src/onnx/padding.cpp
@@ -47,7 +47,7 @@ void cal_auto_padding_size(onnx_parser::node_info info,
        return;
    }

-    auto auto_pad = info.attributes["auto_pad"].s();
+    auto auto_pad = to_upper(info.attributes["auto_pad"].s());
    if(auto_pad.find("SAME") != std::string::npos)
    {
        bool is_same_upper = (auto_pad.find("SAME_UPPER") != std::string::npos);

--- a/src/onnx/parse_pooling.cpp
+++ b/src/onnx/parse_pooling.cpp
@@ -97,7 +97,7 @@ struct parse_pooling : op_parser<parse_pooling>
            values["lp_order"] = info.attributes.at("p").i();
        }

-        // ensure pads availabe only when auto_pad is "NOT_SET"
+        // ensure pads available only when auto_pad is "NOT_SET"
        check_padding_mode(info, "POOLING");

        return values;

--- a/src/onnx/parse_qlinearconv.cpp
+++ b/src/onnx/parse_qlinearconv.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/onnx/padding.hpp>
+#include <migraphx/onnx/conv.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/onnx/checks.hpp>
+#include <migraphx/onnx/broadcast_qdq.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/stringutils.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+
+/*
+ *********************************************************************************
+ *  Reference: see QLinearConv in                                                *
+ *  https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md  *
+ *********************************************************************************
+
+com.microsoft.QLinearConv
+
+Version
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+ATTRIBUTES:
+auto_pad : string
+channels_last : int
+dilations : list of ints
+group : int
+kernel_shape : list of ints
+pads : list of ints
+strides : list of ints
+
+INPUTS (8 - 9):
+x : T1
+x_scale : tensor(float)
+x_zero_point : T1
+w : T2
+w_scale : tensor(float)
+w_zero_point : T2
+y_scale : tensor(float)
+y_zero_point : T3
+B (optional) : T4
+
+OUTPUTS:
+y : T3
+
+Type Constraints:
+T1 : tensor(int8), tensor(uint8)
+T2 : tensor(int8), tensor(uint8)
+T3 : tensor(int8), tensor(uint8)
+T4 : tensor(int32)
+
+More details also at:
+https://xadupre.github.io/draft/onnx/onnx_doc_folder/onnx__QLinearConv.html
+
+*/
+
+struct parse_qlinearconv : op_parser<parse_qlinearconv>
+{
+    std::vector<op_desc> operators() const { return {{"QLinearConv"}}; }
+
+    // basic type checking for QLinearConv Operator
+    void check_inputs(const std::vector<instruction_ref>& inp_arg) const
+    {
+        if(inp_arg.size() < 8)
+            MIGRAPHX_THROW("QLINEARCONV: missing inputs");
+
+        const instruction_ref& in_x       = inp_arg[0];
+        const instruction_ref& in_scale_x = inp_arg[1];
+        const instruction_ref& in_w       = inp_arg[3];
+        const instruction_ref& in_scale_w = inp_arg[4];
+        const instruction_ref& in_scale_y = inp_arg[6];
+
+        auto sh_x   = in_x->get_shape();
+        auto sh_w   = in_w->get_shape();
+        auto type_x = sh_x.type();
+        auto type_w = sh_w.type();
+
+        assert(in_x->get_shape().ndim() > 2);
+
+        if(type_x != shape::int8_type and type_x != shape::uint8_type)
+            MIGRAPHX_THROW("QLINEARCONV: unsupported input type");
+        if(type_w != shape::int8_type and type_w != shape::uint8_type)
+            MIGRAPHX_THROW("QLINEARCONV: unsupported weight type");
+        if(in_scale_x->get_shape().type() != shape::float_type)
+            MIGRAPHX_THROW("QLINEARCONV x scale type should be float");
+        if(in_scale_w->get_shape().type() != shape::float_type)
+            MIGRAPHX_THROW("QLINEARCONV: wt scale type should be float");
+        if(in_scale_y->get_shape().type() != shape::float_type)
+            MIGRAPHX_THROW("QLINEARCONV: y scale type should be float");
+        if(inp_arg.size() > 8 and inp_arg[8]->get_shape().type() != shape::int32_type)
+            MIGRAPHX_THROW("QLINEARCONV y bias should be int32");
+    }
+
+    // process all attributes of QLinearConv Operator..
+    value process_attributes(const onnx_parser& parser,
+                             const onnx_parser::node_info& info,
+                             const std::vector<instruction_ref>& args) const
+    {
+        value values;
+
+        const auto& in_x = args[0];
+        const auto& wt   = args[3];
+
+        size_t kdims = in_x->get_shape().ndim() - 2;
+
+        check_padding_mode(info, "QLINEARCONV");
+
+        values["stride"]   = std::vector<int>(kdims, 1);
+        values["dilation"] = std::vector<int>(kdims, 1);
+        values["padding"]  = std::vector<int>(kdims, 0);
+        values["group"]    = 1;
+
+        if(contains(info.attributes, "group"))
+            values["group"] = parser.parse_value(info.attributes.at("group")).template at<int>();
+
+        if(contains(info.attributes, "strides"))
+        {
+            std::vector<int> st;
+            copy(info.attributes.at("strides").ints(), std::back_inserter(st));
+            check_attr_sizes(kdims, st.size(), "QLINEARCONV: inconsistent strides");
+            values["stride"] = st;
+        }
+
+        if(contains(info.attributes, "dilations"))
+        {
+            std::vector<int> dil;
+            copy(info.attributes.at("dilations").ints(), std::back_inserter(dil));
+            check_attr_sizes(kdims, dil.size(), "QLINEARCONV: inconsistent dilations");
+            values["dilation"] = dil;
+        }
+
+        if(contains(info.attributes, "pads"))
+        {
+            std::vector<int> pads;
+            copy(info.attributes.at("pads").ints(), std::back_inserter(pads));
+            check_attr_sizes(kdims, pads.size() / 2, "QLINEARCONV: inconsistent padding");
+            values["padding"] = pads;
+        }
+        else if(contains(info.attributes, "auto_pad"))
+        {
+            auto in_lens = in_x->get_shape().lens();
+            auto wt_lens = wt->get_shape().lens();
+            std::vector<std::size_t> k_lens(wt_lens.begin() + 2, wt_lens.end());
+            std::vector<int64_t> pads = values["padding"].to_vector<std::int64_t>();
+            cal_auto_padding_size(
+                info, values, k_lens, values["dilation"].to_vector<std::size_t>(), in_lens, pads);
+            values["padding"] = pads;
+        }
+
+        recalc_conv_attributes(values, kdims);
+
+        return values;
+    }
+
+    instruction_ref add_bias_to_conv(const instruction_ref bias_arg,
+                                     const instruction_ref conv_instr,
+                                     const onnx_parser::node_info& info) const
+    {
+        auto conv_sh   = conv_instr->get_shape();
+        auto conv_lens = conv_sh.lens();
+        auto conv_type = conv_sh.type();
+
+        auto broadcast_bias = info.add_instruction(
+            migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", conv_lens}}), bias_arg);
+        auto f_bias =
+            info.add_instruction(make_op("convert", {{"target_type", conv_type}}), broadcast_bias);
+
+        return info.add_instruction(migraphx::make_op("add"), conv_instr, f_bias);
+    };
+
+    instruction_ref parse(const op_desc& /* opd */,
+                          const onnx_parser& parser,
+                          const onnx_parser::node_info& info,
+                          const std::vector<instruction_ref>& args) const
+    {
+        check_inputs(args);
+
+        auto values = process_attributes(parser, info, args);
+
+        // input: quantized x, scale, zero_pt
+        const instruction_ref& in_x         = args[0];
+        const instruction_ref& in_scale_x   = args[1];
+        const instruction_ref& in_zero_pt_x = args[2];
+
+        // input: quantized weights, scale, zero_pt
+        const instruction_ref& in_w         = args[3];
+        const instruction_ref& in_scale_w   = args[4];
+        const instruction_ref& in_zero_pt_w = args[5];
+
+        // for the dequantized output  y: scale & zero_pt
+        const instruction_ref& in_scale_y   = args[6];
+        const instruction_ref& in_zero_pt_y = args[7];
+
+        auto dquant_x = bcast_qdq_instr("dequantizelinear", in_x, in_scale_x, in_zero_pt_x, info);
+
+        auto dquant_w = bcast_qdq_instr("dequantizelinear", in_w, in_scale_w, in_zero_pt_w, info);
+
+        auto conv_op = migraphx::make_op("convolution", values);
+
+        auto conv_x_w = info.add_instruction(conv_op, dquant_x, dquant_w);
+
+        // Biases, if any.. : is an optional argument.
+        if(args.size() > 8)
+            conv_x_w = add_bias_to_conv(args[8], conv_x_w, info);
+
+        auto quant_conv =
+            bcast_qdq_instr("quantizelinear", conv_x_w, in_scale_y, in_zero_pt_y, info);
+        return quant_conv;
+    }
+};
+
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/onnx/parse_qlinearglavgpool.cpp
+++ b/src/onnx/parse_qlinearglavgpool.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/op/pooling.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/onnx/checks.hpp>
+#include <migraphx/onnx/broadcast_qdq.hpp>
+#include <migraphx/instruction.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+
+/*
+ *********************************************************************************
+ *  Reference: see QLinearGlobalAveragePool in                                   *
+ *  github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md          *
+ *********************************************************************************
+
+QLinearGlobalAveragePool consumes an input tensor X and applies
+Average pooling across the values in the same channel. This is
+equivalent to AveragePool with kernel size equal to the spatial
+dimension of input tensor. Input is of type uint8_t or int8_t.
+
+Version
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+Attributes
+channels_last : int
+
+Inputs
+X : T
+
+Input data tensor from the previous operator; According to channels_last, dimensions for image case
+are (N x C x H x W), or (N x H x W x C) where N is the batch size, C is the number of channels, and
+H and W are the height and the width of the data. For non image case, the dimensions are in the form
+of (N x C x D1 x D2 ... Dn), or (N x D1 X D2 ... Dn x C) where N is the batch size.
+
+x_scale : tensor(float)
+Scale of quantized input 'X'. It must be a scalar.
+
+x_zero_point : T
+Zero point tensor for input 'X'. It must be a scalar.
+
+y_scale : tensor(float)
+Scale of quantized output 'Y'. It must be a scalar.
+
+y_zero_point : T
+Zero point tensor for output 'Y'. It must be a scalar.
+
+Outputs
+Y : T
+Output data tensor from pooling across the input tensor. The output tensor has the same rank as the
+input. with the N and C value keep it value, while the other dimensions are all 1. Type Constraints
+T : tensor(uint8), tensor(int8)
+Constrain input and output types to signed/unsigned int8 tensors.
+
+*/
+
+struct parse_qlinearglobalaveragepool : op_parser<parse_qlinearglobalaveragepool>
+{
+    std::vector<op_desc> operators() const { return {{"QLinearGlobalAveragePool"}}; }
+
+    // basic type checking for QLinearGlobalAveragePool Operator
+    void check_inputs(const std::vector<instruction_ref>& args) const
+    {
+        if(args.size() < 5)
+            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: missing inputs");
+
+        const auto& in_x      = args[0];
+        const auto& zero_pt_x = args[2];
+        const auto& zero_pt_y = args[4];
+
+        if(in_x->get_shape().ndim() <= 2)
+            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: input dimensions too small");
+
+        auto type_x = in_x->get_shape().type();
+        if(type_x != migraphx::shape::int8_type and type_x != migraphx::shape::uint8_type)
+            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: unsupported input type");
+
+        if(type_x != zero_pt_x->get_shape().type())
+            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: mismatched type: input zero point");
+
+        if(type_x != zero_pt_y->get_shape().type())
+            MIGRAPHX_THROW("QLINEARGLOBALAVERAGEPOOL: mismatched type: output zero point");
+    }
+
+    instruction_ref parse(const op_desc& /* opd */,
+                          const onnx_parser& parser,
+                          const onnx_parser::node_info& info,
+                          const std::vector<instruction_ref>& args) const
+    {
+        int channels_last =
+            parser.parse_value(info.attributes.at("channels_last")).template at<int>();
+        if(channels_last != 0)
+            MIGRAPHX_THROW(
+                "QLINEARGLOBALAVERAGEPOOL: channels_last (N x D1..Dn x C) is not supported");
+
+        check_inputs(args);
+
+        // Input: X
+
+        const auto& in_x      = args[0];
+        const auto& scale_x   = args[1];
+        const auto& zero_pt_x = args[2];
+        auto dquant_x         = bcast_qdq_instr("dequantizelinear", in_x, scale_x, zero_pt_x, info);
+
+        // Output Y = globalaveragepool(X)
+
+        auto op   = migraphx::op::pooling{migraphx::op::pooling_mode::average};
+        auto lens = in_x->get_shape().lens();
+        std::vector<size_t> lengths(lens.begin() + 2, lens.end());
+        op.lengths = lengths;
+        op.padding = std::vector<size_t>(lens.size());
+        auto out_y = info.add_instruction(op, dquant_x);
+
+        const auto& scale_y   = args[3];
+        const auto& zero_pt_y = args[4];
+
+        auto out_quant_y = bcast_qdq_instr("quantizelinear", out_y, scale_y, zero_pt_y, info);
+
+        return out_quant_y;
+    }
+};
+
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/onnx/parse_qlinearmatmul.cpp
+++ b/src/onnx/parse_qlinearmatmul.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/op/pooling.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/onnx/checks.hpp>
+#include <migraphx/onnx/broadcast_qdq.hpp>
+#include <migraphx/instruction.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+
+/*
+ *********************************************************************************
+ *  Reference: see QLinearMatMul in                                              *
+ *  https://onnx.ai/onnx/operators/onnx__QLinearMatMul.html                      *
+ *********************************************************************************
+
+Matrix product that behaves like numpy.matmul:
+
+https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html. It consumes two
+quantized input tensors, their scales and zero points, scale and zero point of output, and computes
+the quantized output. The quantization formula is y = saturate((x / y_scale) + y_zero_point). For (x
+/ y_scale), it is rounding to nearest ties to even. Refer to https://en.wikipedia.org/wiki/Rounding
+for details. Scale and zero point must have same shape. They must be either scalar (per tensor) or
+N-D tensor (per row for ‘a’ and per column for ‘b’). Scalar refers to per tensor quantization
+whereas N-D refers to per row or per column quantization. If the input is 2D of shape [M, K] then
+zero point and scale tensor may be an M element vector [v_1, v_2, …, v_M] for per row quantization
+and K element vector of shape [v_1, v_2, …, v_K] for per column quantization. If the input is N-D
+tensor with shape [D1, D2, M, K] then zero point and scale tensor may have shape [D1, D2, M, 1] for
+per row quantization and shape [D1, D2, 1, K] for per column quantization. Production must never
+overflow, and accumulation may overflow if and only if in 32 bits.
+
+Inputs
+a (heterogeneous) - T1: N-dimensional quantized matrix a
+
+a_scale (heterogeneous) - tensor(float): scale of quantized input a
+
+a_zero_point (heterogeneous) - T1: zero point of quantized input a
+
+b (heterogeneous) - T2: N-dimensional quantized matrix b
+
+b_scale (heterogeneous) - tensor(float): scale of quantized input b
+
+b_zero_point (heterogeneous) - T2: zero point of quantized input b
+
+y_scale (heterogeneous) - tensor(float): scale of quantized output y
+
+y_zero_point (heterogeneous) - T3: zero point of quantized output y
+
+Outputs
+y (heterogeneous) - T3: Quantized matrix multiply results from a * b
+
+Type Constraints
+T1 in ( tensor(int8), tensor(uint8) ): Constrain input a and its zero point data type to 8-bit
+integer tensor.
+
+T2 in ( tensor(int8), tensor(uint8) ): Constrain input b and its zero point data type to 8-bit
+integer tensor.
+
+T3 in ( tensor(int8), tensor(uint8) ): Constrain output y and its zero point data type to 8-bit
+integer tensor.
+
+*/
+
+struct parse_qlinearmatmul : op_parser<parse_qlinearmatmul>
+{
+    std::vector<op_desc> operators() const { return {{"QLinearMatMul"}}; }
+
+    // basic type checking for QLinearMatMul Operator
+
+    void check_inputs(const std::vector<instruction_ref>& args) const
+    {
+        if(args.size() < 8)
+            MIGRAPHX_THROW("QLINEARMATMUL: missing inputs");
+
+        const auto& in_a = args[0];
+        const auto& in_b = args[3];
+
+        auto sh_a = in_a->get_shape();
+        auto sh_b = in_b->get_shape();
+
+        auto type_a = sh_a.type();
+        auto type_b = sh_b.type();
+        if(type_a != migraphx::shape::int8_type and type_a != migraphx::shape::uint8_type)
+            MIGRAPHX_THROW("QLINEARMATMUL: unsupported input type");
+        if(type_b != migraphx::shape::int8_type and type_b != migraphx::shape::uint8_type)
+            MIGRAPHX_THROW("QLINEARMATMUL: unsupported input type");
+
+        auto lens_a = sh_a.lens();
+        auto lens_b = sh_b.lens();
+
+        size_t dim_a = lens_a.size();
+        size_t dim_b = lens_b.size();
+
+        if(dim_a == 0 or dim_b == 0)
+            MIGRAPHX_THROW("QLINEARMATMUL: empty input");
+
+        // broadcast supported if either is 1-D -- the other can be a 2-D tensor.
+        // if it is 1-D, just prepend/append that lens and check further constraints..
+        if(dim_a == 1)
+        {
+            lens_a.insert(lens_a.begin(), 1);
+            dim_a++;
+        }
+        if(dim_b == 1)
+        {
+            lens_b.push_back(1);
+            dim_b++;
+        }
+
+        // 2-D or higher-order mat mul
+        if(dim_a != dim_b or *lens_a.rbegin() != *(lens_b.rbegin() + 1) or
+           not std::equal(lens_a.rbegin() + 2, lens_a.rend(), lens_b.rbegin() + 2, lens_b.rend()))
+            MIGRAPHX_THROW("QLINEARMATMUL: mismatched input dimensions");
+
+        if(migraphx::any_of({args[1], args[2], args[4], args[5]},
+                            [](auto arg) { return not arg->get_shape().scalar(); }))
+            MIGRAPHX_THROW("QLINEARMATMUL: unsupported row/column quantization");
+    }
+
+    instruction_ref parse(const op_desc& /* opd */,
+                          const onnx_parser& /*parser*/,
+                          const onnx_parser::node_info& info,
+                          const std::vector<instruction_ref>& args) const
+    {
+        check_inputs(args);
+
+        // A
+        const auto& in_a         = args[0];
+        const auto& in_scale_a   = args[1];
+        const auto& in_zero_pt_a = args[2];
+        auto dquant_a = bcast_qdq_instr("dequantizelinear", in_a, in_scale_a, in_zero_pt_a, info);
+
+        // B
+        const auto& in_b         = args[3];
+        const auto& in_scale_b   = args[4];
+        const auto& in_zero_pt_b = args[5];
+        auto dquant_b = bcast_qdq_instr("dequantizelinear", in_b, in_scale_b, in_zero_pt_b, info);
+
+        bool is_a_prepended = false;
+        bool is_b_appended  = false;
+
+        // un-squeeze either tensor if 1-D.
+        if(in_a->get_shape().ndim() == 1)
+        {
+            is_a_prepended = true;
+            dquant_a       = info.add_instruction(make_op("unsqueeze", {{"axes", {0}}}), dquant_a);
+        }
+        if(in_b->get_shape().ndim() == 1)
+        {
+            is_b_appended = true;
+            dquant_b      = info.add_instruction(make_op("unsqueeze", {{"axes", {1}}}), dquant_b);
+        }
+
+        // Y = A * B
+        auto out_y = info.add_instruction(migraphx::make_op("dot"), dquant_a, dquant_b);
+
+        // squeeze just once if necessary.. not twice.
+        if(is_a_prepended)
+            out_y = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), out_y);
+        else if(is_b_appended)
+            out_y = info.add_instruction(make_op("squeeze", {{"axes", {1}}}), out_y);
+
+        const auto& scale_y   = args[6];
+        const auto& zero_pt_y = args[7];
+
+        return bcast_qdq_instr("quantizelinear", out_y, scale_y, zero_pt_y, info);
+    }
+};
+
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -521,6 +521,27 @@ struct find_inner_broadcast
                      }) < (lens.size() - 1);
           }))
            return;
+        if(broadcasts.size() > 1)
+        {
+            auto bcast_strides = broadcasts.front()->get_shape().strides().size();
+            std::vector<size_t> common_axis(bcast_strides, 0);
+            // go through the strides of each broadcast,
+            // keep track of values that are equal to 0 in a dimension
+            for(auto i = 0; i < bcast_strides; i++)
+            {
+                for(const auto& broadcast : broadcasts)
+                {
+                    if(broadcast->get_shape().strides()[i] == 0)
+                        common_axis[i]++;
+                }
+            }
+            // if no common broadcast axis, transformation is not useful
+            if(std::find_if(common_axis.begin(), common_axis.end(), [](auto num_common) {
+                   return num_common > 1;
+               }) == common_axis.end())
+                return;
+        }
+
        std::vector<instruction_ref> inputs;
        std::transform(broadcasts.begin(),
                       broadcasts.end(),

--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
@@ -632,6 +632,9 @@ struct find_transpose_contiguous_reshaper_unary
    }
 };

+// simplifies broadcast->transpose to transpose->broadcast
+// in the case of a scalar, simply rewrite to broadcast
+// this can allow for further optimizations with find_inner_broadcast() in simplify_algebra.cpp
 struct find_broadcast_transpose
 {
    auto matcher() const
@@ -642,17 +645,30 @@ struct find_broadcast_transpose

    void apply(module& m, const match::matcher_result& r) const
    {
-        auto ins       = r.result;
-        auto ins_lens  = ins->get_shape().lens();
+        auto transpose      = r.result;
+        auto transpose_lens = transpose->get_shape().lens();
        auto bcast_ins = r.instructions["bcast_ins"];
        auto input     = bcast_ins->inputs().front();
-        // for now, focusing on scalar transformation
+        // scalar transformation does not need extra transpose
        if(not input->get_shape().scalar())
-            return;
-
+        {
+            // find common shape
+            auto in_lens  = input->get_shape().lens();
+            int lens_diff = transpose_lens.size() - in_lens.size();
+            // insert unsqueeze if input lens < transpose lens
+            if(lens_diff > 0)
+            {
+                std::vector<size_t> unsqueeze_axes(lens_diff);
+                std::iota(unsqueeze_axes.begin(), unsqueeze_axes.end(), 0);
+                input = m.insert_instruction(
+                    bcast_ins, make_op("unsqueeze", {{"axes", unsqueeze_axes}}), input);
+            }
+            // apply transpose before the multibroadcast
+            input = m.insert_instruction(bcast_ins, transpose->get_operator(), input);
+        }
        auto new_mbcast = m.insert_instruction(
-            bcast_ins, make_op("multibroadcast", {{"out_lens", ins_lens}}), input);
-        m.replace_instruction(ins, new_mbcast);
+            bcast_ins, make_op("multibroadcast", {{"out_lens", transpose_lens}}), input);
+        m.replace_instruction(transpose, new_mbcast);
    }
 };


--- a/src/targets/cpu/include/migraphx/cpu/fuse_ops.hpp
+++ b/src/targets/cpu/include/migraphx/cpu/fuse_ops.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_CPU_FUSE_OPS_HPP
 #define MIGRAPHX_GUARD_CPU_FUSE_OPS_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/cpu/context.hpp>
 #include <string>

 namespace migraphx {
@@ -34,9 +34,7 @@ struct module;

 namespace cpu {

-struct context;
-
-struct fuse_ops
+struct MIGRAPHX_CPU_EXPORT fuse_ops
 {
    context* ctx = nullptr;
    std::string name() const { return "cpu::fuse_ops"; }

--- a/src/targets/gpu/fuse_ck.cpp
+++ b/src/targets/gpu/fuse_ck.cpp
@@ -22,9 +22,9 @@
 * THE SOFTWARE.
 */
 #include <migraphx/gpu/fuse_ck.hpp>
+#include <migraphx/gpu/gemm_softmax_gemm.hpp>
 #include <migraphx/matcher.hpp>
 #include <migraphx/pass_manager.hpp>
-#include <migraphx/make_op.hpp>
 #include <migraphx/register_op.hpp>

 namespace migraphx {
@@ -55,7 +55,7 @@ struct ck_gemm
    {
        check_shapes{inputs, *this}.same_ndims();
        if(inputs.size() < 2)
-            MIGRAPHX_THROW("should have at least two inputs.");
+            MIGRAPHX_THROW(name() + ": should have at least two inputs.");
        auto a = inputs[0];
        auto b = inputs[1];
        for(const auto& input : inputs)
@@ -65,21 +65,27 @@ struct ck_gemm
            return r;
        return r.with_type(mods.front()->get_output_shapes().front().type());
    }
+
+    static bool is_ck_supported_type(shape::type_t t)
+    {
+        return contains({shape::half_type, shape::int8_type, shape::int32_type}, t);
+    }
 };
 MIGRAPHX_REGISTER_OP(ck_gemm);

-namespace {
-
-bool is_ck_supported_type(shape::type_t t)
+struct ck_gemm_softmax_gemm : gemm_softmax_gemm
 {
-    return contains({shape::half_type, shape::int8_type, shape::int32_type}, t);
-}
+    std::string name() const { return "gpu::ck_gemm_softmax_gemm"; }
+};
+MIGRAPHX_REGISTER_OP(ck_gemm_softmax_gemm);
+
+namespace {

 MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
 {
    if(ins->name() != "dot" and ins->name() != "quant_dot")
        return false;
-    if(not is_ck_supported_type(ins->get_shape().type()))
+    if(not ck_gemm::is_ck_supported_type(ins->get_shape().type()))
        return false;
    auto a = ins->inputs().front()->get_shape();
    auto b = ins->inputs().back()->get_shape();
@@ -127,7 +133,11 @@ struct find_ck_gemm_pointwise
           ins->get_shape().type() != gemm_ins->get_shape().type())
            return;
        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) {
-               return not is_ck_supported_type(input->get_shape().type());
+               return not ck_gemm::is_ck_supported_type(input->get_shape().type());
+           }))
+            return;
+        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) {
+               return not input->inputs().empty() and input->inputs().front()->name() == "capture";
           }))
            return;
        assert(gemm_it != inputs.end());
@@ -152,7 +162,7 @@ struct find_ck_gemm_pointwise

 struct find_ck_gemm
 {
-    auto matcher() const { return match::name("dot")(is_ck_gemm().bind("gemm")); }
+    auto matcher() const { return match::name("dot", "quant_dot")(is_ck_gemm().bind("gemm")); }

    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
    {
@@ -161,11 +171,26 @@ struct find_ck_gemm
    }
 };

+struct find_ck_gemm_softmax_gemm
+{
+    auto matcher() const { return match::name("gpu::pre_gemm_softmax_gemm"); }
+
+    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
+    {
+        auto ins = r.result;
+        auto v   = ins->get_operator().to_value();
+        assert(v.contains("scale"));
+        auto scale = v.at("scale").to<float>();
+        mpm.get_module().replace_instruction(
+            ins, ck_gemm_softmax_gemm{migraphx::make_op("dot"), scale}, ins->inputs());
+    }
+};
+
 } // namespace

 void fuse_ck::apply(module_pass_manager& mpm) const
 {
-    match::find_matches(mpm, find_ck_gemm_pointwise{});
+    match::find_matches(mpm, find_ck_gemm_softmax_gemm{}, find_ck_gemm_pointwise{});
    match::find_matches(mpm, find_ck_gemm{});
 }


--- a/src/targets/gpu/include/migraphx/gpu/ck.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/ck.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_GPU_CK_HPP
+#define MIGRAPHX_GUARD_GPU_CK_HPP
+
+#include <migraphx/compile_src.hpp>
+#include <migraphx/env.hpp>
+#include <migraphx/shape.hpp>
+#include <migraphx/stringutils.hpp>
+#include <string_view>
+
+#include "ck/host/device_gemm_multiple_d.hpp"
+#include "ck/host/device_batched_gemm_softmax_gemm.hpp"
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+#ifndef _WIN32
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK);
+#endif
+
+// NOLINTNEXTLINE
+const char* const disable_warning_pragma = R"__migraphx__(
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Weverything"
+${content}
+#pragma clang diagnostic pop
+)__migraphx__";
+
+template <class P>
+std::string ck_disable_warnings(P p)
+{
+    return interpolate_string(disable_warning_pragma,
+                              {{"content", std::string{p.data(), p.size()}}});
+}
+
+static std::unordered_map<std::string, std::string> create_ck_header_strings()
+{
+    std::unordered_map<std::string, std::string> result;
+    auto ck_headers = ck::host::GetHeaders();
+
+    std::transform(
+        ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto& p) {
+            return std::pair<std::string, std::string>(p.first, ck_disable_warnings(p.second));
+        });
+    return result;
+}
+
+static std::vector<src_file> create_ck_headers()
+{
+    static const auto& header_strings = create_ck_header_strings();
+    std::vector<src_file> srcs;
+    std::transform(header_strings.begin(),
+                   header_strings.end(),
+                   std::back_inserter(srcs),
+                   [&](auto& p) { return src_file{p}; });
+    return srcs;
+}
+
+static inline const std::vector<src_file>& ck_headers()
+{
+    static const auto& headers = create_ck_headers();
+    return headers;
+}
+
+inline bool transposed_matrix(const shape& s) { return s.strides().back() != 1; }
+
+inline ck::host::DataType get_type(const shape& s)
+{
+    if(s.type() == shape::half_type)
+        return ck::host::DataType::Half;
+    else if(s.type() == shape::float_type)
+        return ck::host::DataType::Float;
+    else if(s.type() == shape::int8_type)
+        return ck::host::DataType::Int8;
+    else if(s.type() == shape::int32_type)
+        return ck::host::DataType::Int32;
+    MIGRAPHX_THROW("Unsupported ck type");
+}
+
+inline std::size_t get_batch_count(const shape& s)
+{
+    return std::accumulate(
+        s.lens().rbegin() + 2, s.lens().rend(), std::size_t{1}, std::multiplies<std::size_t>());
+}
+
+inline void fold_batch_dims(shape& s)
+{
+    auto lens = s.lens();
+    if(lens.size() <= 2)
+        return;
+    auto batch_count = get_batch_count(s);
+    auto m1          = lens.at(lens.size() - 2);
+    auto m2          = lens.at(lens.size() - 1);
+    if(transposed_matrix(s))
+        s = shape{s.type(), {m1, m2 * batch_count}};
+    else
+        s = shape{s.type(), {m1 * batch_count, m2}};
+}
+
+inline void remove_batch_dims(shape& s)
+{
+    auto lens = s.lens();
+    if(lens.size() <= 2)
+        return;
+    auto m1 = lens.at(lens.size() - 2);
+    auto m2 = lens.at(lens.size() - 1);
+    s       = shape{s.type(), {m1, m2}};
+}
+
+inline bool standard_batch(const shape& s)
+{
+    if(s.lens().size() < 3)
+        return true;
+    std::vector<std::size_t> lens(s.lens().begin(), s.lens().end() - 2);
+    std::vector<std::size_t> strides(s.strides().begin(), s.strides().end() - 2);
+    auto base = *(s.lens().end() - 2) * *(s.lens().end() - 1);
+    std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto stride) {
+        return stride / base;
+    });
+    return shape{s.type(), lens, strides}.standard();
+}
+
+inline bool can_fold_batch(const std::vector<shape>& inputs)
+{
+    const auto& b_shape = inputs[1];
+    if(std::any_of(inputs.begin() + 2, inputs.end() - 1, [](auto input) {
+           return not standard_batch(input);
+       }))
+        return false;
+    const auto& b_strides = b_shape.strides();
+    return std::all_of(
+        b_strides.begin(), b_strides.end() - 2, [](auto stride) { return stride == 0; });
+}
+
+} // namespace gpu
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_CK_HPP
--- a/src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp
@@ -24,7 +24,6 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_FUSE_OPS_HPP
 #define MIGRAPHX_GUARD_RTGLIB_FUSE_OPS_HPP

-#include <migraphx/config.hpp>
 #include <migraphx/gpu/context.hpp>

 namespace migraphx {
@@ -34,7 +33,7 @@ struct module;

 namespace gpu {

-struct fuse_ops
+struct MIGRAPHX_GPU_EXPORT fuse_ops
 {
    context* ctx   = nullptr;
    bool fast_math = true;

--- a/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP
+#define MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP
+
+#include <migraphx/make_op.hpp>
+#include <migraphx/check_shapes.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct gemm_softmax_gemm
+{
+    operation op = make_op("dot");
+    float scale  = 1.0;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.op, "op"), f(self.scale, "scale"));
+    }
+
+    std::string name() const { return "gpu::gemm_softmax_gemm"; }
+
+    void check_gemm_shape(const shape& s) const
+    {
+        if(not contains(range(s.strides().rbegin(), s.strides().rbegin() + 3), 1))
+            MIGRAPHX_THROW("Invalid shape for " + name());
+    }
+
+    shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>&) const
+    {
+        check_shapes{inputs, *this}.same_ndims();
+        if(inputs.size() < 3)
+            MIGRAPHX_THROW(name() + ": Expected 3 inputs but got " + to_string(inputs.size()));
+        auto a  = inputs[0];
+        auto b  = inputs[1];
+        auto b1 = inputs[2];
+        for(const auto& input : inputs)
+        {
+            check_gemm_shape(input);
+        }
+        return op.compute_shape({op.compute_shape({a, b}), b1});
+    }
+
+    static bool is_ck_supported_type(shape::type_t t) { return contains({shape::half_type}, t); }
+};
+
+} // namespace gpu
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP
--- a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
 #define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP

-#include <migraphx/config.hpp>
+#include <migraphx/gpu/config.hpp>
 #include <string>

 namespace migraphx {
@@ -34,7 +34,7 @@ struct module_pass_manager;

 namespace gpu {

-struct prefuse_ops
+struct MIGRAPHX_GPU_EXPORT prefuse_ops
 {
    std::string name() const { return "gpu::prefuse_ops"; }
    void apply(module_pass_manager& mpm) const;

--- a/src/targets/gpu/jit/ck_gemm.cpp
+++ b/src/targets/gpu/jit/ck_gemm.cpp
@@ -27,6 +27,7 @@
 #include <migraphx/make_op.hpp>
 #include <migraphx/gpu/context.hpp>

+#include <migraphx/gpu/ck.hpp>
 #include <migraphx/env.hpp>
 #include <migraphx/file_buffer.hpp>
 #include <migraphx/gpu/compile_gen.hpp>
@@ -37,8 +38,6 @@
 #include <migraphx/reduce_dims.hpp>
 #include <migraphx/stringutils.hpp>

-#include "ck/host/device_gemm_multiple_d.hpp"
-
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

@@ -46,12 +45,6 @@ namespace gpu {

 using namespace migraphx::gpu::gen; // NOLINT

-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM);
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING);
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING_VALUE);
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG);
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK);
-
 // NOLINTNEXTLINE
 static const char* const ck_gemm_kernel = R"__migraphx__(
 #include <args.hpp>
@@ -79,219 +72,10 @@ MIGRAPHX_GLOBAL void ${kernel}(${params})

 )__migraphx__";

-// NOLINTNEXTLINE
-static const char* const disable_warning_pragma = R"__migraphx__(
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Weverything"
-${content}
-#pragma clang diagnostic pop
-)__migraphx__";
-
-template <class P>
-static std::string ck_disable_warnings(P p)
-{
-    return interpolate_string(disable_warning_pragma,
-                              {{"content", std::string{p.first, p.second}}});
-}
-
-static std::unordered_map<std::string, std::string> create_ck_header_strings()
-{
-    std::unordered_map<std::string, std::string> result;
-    auto ck_headers = ck::host::GetHeaders();
-
-    std::transform(
-        ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto&& p) {
-            return std::make_pair(p.first, ck_disable_warnings(p.second));
-        });
-    return result;
-}
-
-static std::vector<src_file> create_ck_headers()
-{
-    static const auto& header_strings = create_ck_header_strings();
-    std::vector<src_file> srcs;
-    std::transform(
-        header_strings.begin(), header_strings.end(), std::back_inserter(srcs), [&](auto&& p) {
-            return src_file{p.first, p.second};
-        });
-    return srcs;
-}
-
-static const std::vector<src_file>& ck_headers()
-{
-    static const auto& headers = create_ck_headers();
-    return headers;
-}
-
-static bool transposed_matrix(const shape& s) { return s.strides().back() != 1; }
-
-using tuning_entry = std::pair<std::vector<shape>, size_t>;
-static std::vector<tuning_entry> read_tuning(const std::string& s)
-{
-    if(not fs::exists(s))
-        return {};
-    return from_value<std::vector<tuning_entry>>(from_json_string(read_string(s)));
-}
-
-static float matrix_distance(const shape& x, const shape& y)
-{
-    if(x.type() != y.type())
-        return std::numeric_limits<float>::max();
-    if(transposed_matrix(x) != transposed_matrix(y))
-        return std::numeric_limits<float>::max();
-    auto sum_squared = std::inner_product(x.lens().rbegin(),
-                                          x.lens().rbegin() + 2,
-                                          y.lens().rbegin(),
-                                          0,
-                                          std::plus<>{},
-                                          [](auto a, auto b) { return (a - b) * (a - b); });
-    return std::sqrt(sum_squared);
-}
-
-static std::size_t get_tuning_for(const std::vector<shape>& inputs)
-{
-    static auto tuning = read_tuning(string_value_of(MIGRAPHX_CK_TUNING{}, ""));
-    if(tuning.empty())
-    {
-        std::cout << "*********** Warning: No CK tuning! for config:" << std::endl;
-        std::cout << "  " << inputs[0] << std::endl;
-        std::cout << "  " << inputs[1] << std::endl;
-        std::cout << "  " << inputs[2] << std::endl;
-    }
-    auto it = std::find_if(
-        tuning.begin(), tuning.end(), [&](const auto& p) { return p.first == inputs; });
-    if(it == tuning.end())
-    {
-        std::cout << "*********** Warning: CK tuning missing for config!" << std::endl;
-        std::cout << "  " << inputs[0] << std::endl;
-        std::cout << "  " << inputs[1] << std::endl;
-        std::cout << "  " << inputs[2] << std::endl;
-        std::vector<std::pair<float, std::size_t>> w;
-        std::transform(tuning.begin(), tuning.end(), std::back_inserter(w), [&](const auto& p) {
-            if(inputs.size() < 3 or p.first.size() < 3)
-                MIGRAPHX_THROW("Invalid CK config");
-            auto avg_distance = std::inner_product(
-                p.first.begin(),
-                p.first.begin() + 3,
-                inputs.begin(),
-                0.0f,
-                std::plus<>{},
-                [](const auto& x, const auto& y) { return matrix_distance(x, y) / 3.0f; });
-            return std::make_pair(avg_distance, p.second);
-        });
-        std::sort(w.begin(), w.end());
-        std::size_t default_value = 4;
-        if(not w.empty())
-            default_value = w.front().second;
-        auto tuning_val = value_of(MIGRAPHX_CK_TUNING_VALUE{}, default_value);
-        std::cout << "*********** Warning: CK try tuning: " << tuning_val << std::endl;
-        return tuning_val;
-    }
-    return it->second;
-}
-
 struct ck_gemm_compiler : compiler<ck_gemm_compiler>
 {
-    static std::string get_layout(const shape& s)
-    {
-        return transposed_matrix(s) ? "ck::tensor_layout::gemm::ColumnMajor"
-                                    : "ck::tensor_layout::gemm::RowMajor";
-    }
-
-    static ck::host::DataType get_type(const shape& s)
-    {
-        if(s.type() == shape::half_type)
-            return ck::host::DataType::Half;
-        else if(s.type() == shape::float_type)
-            return ck::host::DataType::Float;
-        else if(s.type() == shape::int8_type)
-            return ck::host::DataType::Int8;
-        else if(s.type() == shape::int32_type)
-            return ck::host::DataType::Int32;
-        MIGRAPHX_THROW("Unsupported ck type");
-    }
-
-    template <class Iterator, class F>
-    static std::string ck_tuple(Iterator start, Iterator last, F f)
-    {
-        std::vector<std::string> s;
-        std::transform(start, last, std::back_inserter(s), f);
-        return "ck::Tuple<" + join_strings(s, ",") + ">";
-    }
-
-    static std::vector<shape> adjust_inputs(std::vector<shape> inputs, bool& swap_inputs)
-    {
-        swap_inputs  = false;
-        auto c_shape = inputs.back();
-        if(not transposed_matrix(c_shape))
-            return inputs;
-        std::vector<int64_t> perm(c_shape.lens().size());
-        std::iota(perm.begin(), perm.end(), 0);
-        std::swap(perm[perm.size() - 1], perm[perm.size() - 2]);
-        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](shape s) {
-            return reorder_shape(s, perm);
-        });
-        swap_inputs = true;
-        return inputs;
-    }
-
-    static std::size_t get_batch_count(const shape& s)
-    {
-        return std::accumulate(
-            s.lens().rbegin() + 2, s.lens().rend(), std::size_t{1}, std::multiplies<std::size_t>());
-    }
-
-    static void fold_batch_dims(shape& s)
-    {
-        auto lens = s.lens();
-        if(lens.size() <= 2)
-            return;
-        auto batch_count = get_batch_count(s);
-        auto m1          = lens.at(lens.size() - 2);
-        auto m2          = lens.at(lens.size() - 1);
-        if(transposed_matrix(s))
-            s = shape{s.type(), {m1, m2 * batch_count}};
-        else
-            s = shape{s.type(), {m1 * batch_count, m2}};
-    }
-
-    static void remove_batch_dims(shape& s)
-    {
-        auto lens = s.lens();
-        if(lens.size() <= 2)
-            return;
-        auto m1 = lens.at(lens.size() - 2);
-        auto m2 = lens.at(lens.size() - 1);
-        s       = shape{s.type(), {m1, m2}};
-    }
-
    std::vector<std::string> names() const { return {"ck_gemm", "gpu::ck_gemm"}; }

-    static bool standard_batch(const shape& s)
-    {
-        if(s.lens().size() < 3)
-            return true;
-        std::vector<std::size_t> lens(s.lens().begin(), s.lens().end() - 2);
-        std::vector<std::size_t> strides(s.strides().begin(), s.strides().end() - 2);
-        auto base = *(s.lens().end() - 2) * *(s.lens().end() - 1);
-        std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto stride) {
-            return stride / base;
-        });
-        return shape{s.type(), lens, strides}.standard();
-    }
-
-    bool can_fold_batch(const std::vector<shape>& inputs) const
-    {
-        const auto& b_shape = inputs[1];
-        if(std::any_of(inputs.begin() + 2, inputs.end() - 1, [](auto input) {
-               return not standard_batch(input);
-           }))
-            return false;
-        const auto& b_strides = b_shape.strides();
-        return std::all_of(
-            b_strides.begin(), b_strides.end() - 2, [](auto stride) { return stride == 0; });
-    }
-
    ck::host::device_gemm_multiple_d::Problem create_problem(const std::vector<shape>& inputs,
                                                             const value& v) const
    {
@@ -300,8 +84,7 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>
        const auto& c_shape = inputs.back();

        // cppcheck-suppress unreadVariable
-        auto rank = a_shape.ndim();
-
+        auto rank        = a_shape.ndim();
        auto batch_count = get_batch_count(c_shape);
        auto m           = c_shape.lens()[rank - 2];
        m                = can_fold_batch(inputs) ? m * batch_count : m;
@@ -351,12 +134,8 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>

    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
-        const auto& a_shape = inputs[0];
-        const auto& b_shape = inputs[1];
        const auto& c_shape = inputs.back();
-        auto tuning_value   = v.get("tuning_value", 4);
-        if(not v.contains("tuning_value"))
-            tuning_value = get_tuning_for({a_shape, b_shape, c_shape});
+        auto tuning_value   = v.get("tuning_value", 34);
        auto batch_count = get_batch_count(c_shape);
        auto problem     = create_problem(inputs, v);


--- a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp
+++ b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <fstream>
+#include <migraphx/filesystem.hpp>
+#include <migraphx/gpu/compiler.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/gpu/context.hpp>
+
+#include <migraphx/env.hpp>
+#include <migraphx/file_buffer.hpp>
+#include <migraphx/gpu/ck.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/module.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/reduce_dims.hpp>
+#include <migraphx/stringutils.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+namespace gpu {
+
+using namespace migraphx::gpu::gen; // NOLINT
+
+// NOLINTNEXTLINE
+static const char* const ck_gemm_softmax_gemm_kernel = R"__migraphx__(
+#include <args.hpp>
+#include <migraphx/kernels/ck_gemm_softmax_gemm.hpp>
+#include <migraphx/kernels/pointwise.hpp>
+#include <migraphx/kernels/ops.hpp>
+#include <migraphx/kernels/integral_constant.hpp>
+#include <migraphx/kernels/generic_constant.hpp>
+#include <${include}>
+
+namespace migraphx {
+
+${preamble}
+
+extern "C" {
+
+MIGRAPHX_GLOBAL void ${kernel}(${params})
+{
+    transform_args(make_tensors(), rotate_last())(${args})([](auto... xs) {
+        auto settings = make_ck_gemm_softmax_gemm_settings(MIGRAPHX_MAKE_CONSTANT(float{SCALE}));
+        ck_gemm_softmax_gemm<${solution}, ${blocks_per_batch}>(settings, xs...);
+    });
+}
+
+}
+
+} // namespace migraphx
+
+)__migraphx__";
+
+struct ck_gemm_softmax_gemm_compiler : compiler<ck_gemm_softmax_gemm_compiler>
+{
+    std::vector<std::string> names() const
+    {
+        return {"ck_gemm_softmax_gemm", "gpu::ck_gemm_softmax_gemm"};
+    }
+
+    ck::host::device_batched_gemm_softmax_gemm::Problem
+    create_problem(const std::vector<shape>& inputs, const value&) const
+    {
+        const auto& a_shape  = inputs[0];
+        const auto& b_shape  = inputs[1];
+        const auto& b1_shape = inputs[2];
+        const auto& c_shape  = inputs.back();
+
+        // cppcheck-suppress unreadVariable
+        auto rank        = a_shape.ndim();
+        auto batch_count = get_batch_count(c_shape);
+        auto m           = c_shape.lens()[rank - 2];
+        m                = can_fold_batch(inputs) ? m * batch_count : m;
+        auto n           = c_shape.lens().back();
+        auto k           = a_shape.lens().back();
+        auto o           = c_shape.lens().back();
+
+        const bool trans_a  = transposed_matrix(a_shape);
+        const bool trans_b  = transposed_matrix(b_shape);
+        const bool trans_b1 = transposed_matrix(b1_shape);
+        const bool trans_c  = transposed_matrix(c_shape);
+        const auto a_type   = get_type(a_shape);
+        const auto b_type   = get_type(b_shape);
+        const auto b1_type  = get_type(b1_shape);
+        const auto c_type   = get_type(c_shape);
+
+        std::string ck_passthrough = "ck_passthrough";
+        return ck::host::device_batched_gemm_softmax_gemm::Problem{m,
+                                                                   n,
+                                                                   k,
+                                                                   o,
+                                                                   trans_a,
+                                                                   trans_b,
+                                                                   trans_b1,
+                                                                   trans_c,
+                                                                   a_type,
+                                                                   b_type,
+                                                                   b1_type,
+                                                                   c_type,
+                                                                   ck_passthrough,
+                                                                   ck_passthrough,
+                                                                   ck_passthrough,
+                                                                   ck_passthrough};
+    }
+
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        const auto& c_shape = inputs.back();
+        auto tuning_value   = v.get("tuning_value", 5);
+        auto batch_count    = get_batch_count(c_shape);
+        auto problem        = create_problem(inputs, v);
+
+        const auto include_header   = problem.GetIncludeHeader();
+        const auto solutions        = problem.GetSolutions(ctx.get_current_device().get_gfx_name());
+        const auto& solution        = solutions.at(tuning_value);
+        const auto template_str     = solution.template_str;
+        const auto blocks_per_batch = solution.grid_size;
+        const auto block_size       = solution.block_size;
+
+        hip_compile_options options;
+        options.additional_src_files = ck_headers();
+        auto grid_size = can_fold_batch(inputs) ? blocks_per_batch : batch_count * blocks_per_batch;
+        options.set_launch_params(v, grid_size * block_size, block_size);
+        options.inputs         = inputs;
+        options.output         = c_shape;
+        options.kernel_name    = v.get("kernel", "ck_gemm_softmax_gemm_kernel");
+        options.virtual_inputs = inputs;
+        if(can_fold_batch(inputs))
+        {
+            auto vinputs = inputs;
+            fold_batch_dims(vinputs[0]);
+            remove_batch_dims(vinputs[1]);
+            std::for_each(vinputs.begin() + 2, vinputs.end(), fold_batch_dims);
+            options.virtual_inputs = vinputs;
+        }
+
+        if(v.get("check", false) or enabled(MIGRAPHX_CK_DEBUG{}))
+            options.params += " -DMIGRAPHX_CK_CHECK=1";
+
+        // scale
+        assert(v.contains("scale"));
+        auto scale = v.at("scale").to<float>();
+        options.params += " -DSCALE=" + std::to_string(scale);
+
+        auto src = interpolate_string(ck_gemm_softmax_gemm_kernel,
+                                      {{"solution", template_str},
+                                       {"include", include_header},
+                                       {"params", enum_params(inputs.size(), "void * private_p")},
+                                       {"args", enum_params(inputs.size(), "private_p")},
+                                       {"blocks_per_batch", to_string(blocks_per_batch)},
+                                       {"preamble", v.get("preamble", std::string{})},
+                                       {"kernel", options.kernel_name}});
+
+        return compile_hip_code_object(src, options);
+    }
+
+    value create_settings(instruction_ref ins, const operation& op) const
+    {
+        auto v      = op.to_value();
+        v["kernel"] = "ck_gemm_softmax_gemm_kernel";
+        if(not ins->module_inputs().empty())
+        {
+            auto* pm      = ins->module_inputs().front();
+            v["preamble"] = generate_pointwise(*pm, "post_ck_gemm_softmax_gemm_function") +
+                            "\nMIGRAPHX_LIFT_CLASS(post_ck_gemm_softmax_gemm, "
+                            "post_ck_gemm_softmax_gemm_function);";
+            v["post"]   = "ck_function_adaptor<post_ck_gemm_softmax_gemm>";
+            v["kernel"] = "ck_gemm_softmax_gemm_" + generate_name_from_ops(*pm) + "_kernel";
+        }
+        return v;
+    }
+
+    compiler_replace
+    compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const
+    {
+        auto shapes = to_shapes(ins->inputs());
+        auto v      = create_settings(ins, op);
+        if(not solution.is_null())
+            v["tuning_value"] = solution;
+        return {compile_op(ctx, shapes, v),
+                [=](module& m, instruction_ref ins2, const operation& code_object) {
+                    if(enabled(MIGRAPHX_LOG_CK_GEMM{}))
+                    {
+                        std::vector<shape> gemm_shapes{
+                            shapes[0], shapes[1], shapes.back().with_type(shapes[0].type())};
+                        std::cout << "gpu::ck_gemm_softmax_gemm: "
+                                  << to_json_string(to_value(gemm_shapes)) << std::endl;
+                    }
+                    m.replace_instruction(ins2, code_object, ins2->inputs());
+                }};
+    }
+
+    optional<tuning_config>
+    get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive) const
+    {
+        if(not exhaustive and not enabled(MIGRAPHX_TUNE_CK{}))
+            return nullopt;
+        tuning_config tc;
+        auto shapes    = to_shapes(ins->inputs());
+        auto problem   = create_problem(shapes, create_settings(ins, op));
+        auto solutions = problem.GetSolutions(ctx.get_current_device().get_gfx_name());
+        tc.solutions.resize(solutions.size());
+        std::iota(tc.solutions.begin(), tc.solutions.end(), 0);
+        std::vector<shape> gemm_shapes{shapes[0], shapes[1], shapes.back()};
+        tc.problem = to_value(gemm_shapes);
+        return tc;
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp
@@ -154,6 +154,17 @@ struct ck_add
    }
 };

+// In CK, the B matrix is ordered as N,K instead of K,N
+template <class Dims>
+constexpr auto ck_transposeb_dims(Dims dims)
+{
+    return unpack(dims, [](auto k, auto n) { return make_const_array(n, k); });
+}
+
+template <class Tensor>
+using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c<Tensor>{}.lens),
+                                          ck_transposeb_dims(get_shape_c<Tensor>{}.strides)));
+
 #ifdef MIGRAPHX_CK_CHECK
 #define MIGRAPHX_CK_STATIC_ASSERT static_assert
 #else

--- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp
@@ -33,17 +33,6 @@

 namespace migraphx {

-// In CK, the B matrix is ordered as N,K instead of K,N
-template <class Dims>
-constexpr auto ck_transposeb_dims(Dims dims)
-{
-    return unpack(dims, [](auto k, auto n) { return make_const_array(n, k); });
-}
-
-template <class Tensor>
-using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c<Tensor>{}.lens),
-                                          ck_transposeb_dims(get_shape_c<Tensor>{}.strides)));
-
 template <class G, class E, class A, class B, class... Ds>
 __device__ void ck_gemm_matrix(E e, A a, B b, Ds... ds)
 {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_KERNELS_CK_GEMM_SOFTMAX_GEMM_HPP
+#define MIGRAPHX_GUARD_KERNELS_CK_GEMM_SOFTMAX_GEMM_HPP
+
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/integral_constant.hpp>
+#include <migraphx/kernels/tensor_view.hpp>
+#include <migraphx/kernels/ck.hpp>
+#include <migraphx/kernels/gemm_batcher.hpp>
+
+namespace migraphx {
+
+template <class T>
+struct ck_gemm_softmax_gemm_settings
+{
+    T scale{};
+};
+
+template <class... Ts>
+constexpr ck_gemm_softmax_gemm_settings<Ts...> make_ck_gemm_softmax_gemm_settings(Ts... xs)
+{
+    return {xs...};
+}
+
+template <class G, class C, class A, class B, class B1, class Settings>
+__device__ void ck_gemm_softmax_gemm_matrix(C c, A a, B b, B1 b1, Settings s)
+{
+    constexpr auto desc = G::make_descriptor(to_ck_tensor<A>(),
+                                             to_ck_tensor<ck_transposeb<B>>(),
+                                             to_ck_tensor<ck_transposeb<B1>>(),
+                                             to_ck_tensor<C>());
+
+    static_assert(desc.IsValid(), "Invalid ck gemm.");
+
+    G::Run(desc,
+           s.scale,
+           to_ck_const_pointer(a.data()),
+           to_ck_const_pointer(b.data()),
+           to_ck_const_pointer(b1.data()),
+           to_ck_pointer(c.data()));
+}
+
+template <class G, index_int BlocksPerBatch, class... Ts, class Settings>
+__device__ void ck_gemm_softmax_gemm(Settings s, Ts... xs)
+{
+    gemm_batch_args(make_index(), _c<BlocksPerBatch>, xs...)(
+        [&](auto... ys) { ck_gemm_softmax_gemm_matrix<G>(ys..., s); });
+}
+
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -23,16 +23,17 @@
 */
 #include <migraphx/permutation.hpp>
 #include <migraphx/gpu/prefuse_ops.hpp>
+#include <migraphx/gpu/gemm_softmax_gemm.hpp>
 #include <migraphx/match/layernorm.hpp>
-#include <migraphx/check_shapes.hpp>
-#include <migraphx/make_op.hpp>
 #include <migraphx/register_op.hpp>
 #include <migraphx/pass_manager.hpp>
 #include <migraphx/dead_code_elimination.hpp>
+#include <migraphx/gpu/ck.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
+
 namespace {

 template <class Derived, std::size_t N>
@@ -120,6 +121,60 @@ struct find_add_layernorm
        m.replace_instruction(ins, add_layernorm{op.epsilon}, add_ins->inputs());
    }
 };
+
+struct pre_gemm_softmax_gemm : gemm_softmax_gemm
+{
+    std::string name() const { return "gpu::pre_gemm_softmax_gemm"; }
+};
+MIGRAPHX_REGISTER_OP(pre_gemm_softmax_gemm);
+
+MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
+{
+    if(ins->name() != "dot")
+        return false;
+    if(not pre_gemm_softmax_gemm::is_ck_supported_type(ins->get_shape().type()))
+        return false;
+    return true;
+}
+
+struct find_gemm_softmax_gemm
+{
+    auto matcher() const
+    {
+        auto gemm1 =
+            match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1")));
+        auto mul = match::name("mul")(
+            match::nargs(2), match::either_arg(0, 1)(match::is_constant().bind("scale"), gemm1));
+        auto softmax = match::name("softmax")(match::arg(0)(mul)).bind("softmax");
+
+        return match::name("dot")(is_ck_gemm().bind("gemm2"))(match::arg(0)(softmax));
+    }
+
+    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
+    {
+        auto ins       = r.result;
+        auto gemm2_ins = r.instructions["gemm2"];
+        auto gemm1_ins = r.instructions["gemm1"];
+        auto scale_lit = r.instructions["scale"];
+
+        float scale = 1.0;
+        scale_lit->eval().visit([&](const auto s) {
+            // CK only supports single-valued scale
+            if(std::all_of(
+                   s.begin() + 1, s.end(), [&](auto v) { return float_equal(v, s.front()); }))
+                scale = s.front();
+            else
+                return;
+        });
+
+        auto inputs = gemm1_ins->inputs();            // A, B
+        inputs.push_back(gemm2_ins->inputs().back()); // B1
+
+        mpm.get_module().replace_instruction(
+            ins, pre_gemm_softmax_gemm{gemm2_ins->get_operator(), scale}, inputs);
+    }
+};
+
 } // namespace

 void prefuse_ops::apply(module_pass_manager& mpm) const
@@ -127,6 +182,8 @@ void prefuse_ops::apply(module_pass_manager& mpm) const
    match::find_matches(mpm.get_module(), find_layernorm{});
    mpm.run_pass(dead_code_elimination{});
    match::find_matches(mpm.get_module(), find_add_layernorm{});
+    if(enabled(MIGRAPHX_ENABLE_CK{}))
+        match::find_matches(mpm, find_gemm_softmax_gemm{});
 }

 } // namespace gpu