Commit 29820def authored by Paul

Merge

parents 6aa89319 be33669b
......@@ -8,6 +8,9 @@ on:
- master
- 'release/**'
env:
DOCKER_USER: ${{secrets.DOCKERHUB_USERID}}
DOCKER_TOKEN: ${{secrets.DOCKERHUB_TOKEN}}
jobs:
cancel:
......@@ -17,23 +20,93 @@ jobs:
uses: styfle/cancel-workflow-action@0.11.0
with:
access_token: ${{ github.token }}
check_image:
name: Check if image exists in registry
runs-on: ubuntu-latest
outputs:
imageexists: ${{ steps.check_image.outputs.imageexists }}
imagetag: ${{ steps.image_hash.outputs.imagetag }}
imageexists_sles: ${{ steps.check_image.outputs.imageexists_sles }}
imagetag_sles: ${{ steps.image_hash.outputs.imagetag_sles }}
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Create Image Tag
id: image_hash
run: |
echo "imagetag=rocm/migraphx-private:hip-clang-${{hashFiles('**/hip-clang.docker', '**/*requirements.txt', '**/install_prereqs.sh', '**/rbuild.ini')}}" >> $GITHUB_OUTPUT
echo "imagetag_sles=rocm/migraphx-sles-private:hip-clang-${{hashFiles('**/tools/docker/sles.docker', '**/*requirements.txt', '**/install_prereqs.sh', '**/rbuild.ini')}}" >> $GITHUB_OUTPUT
- name: Check if image is built already
id: check_image
env:
DOCKERIMAGE: ${{ steps.image_hash.outputs.imagetag }}
DOCKERIMAGE_SLES: ${{ steps.image_hash.outputs.imagetag_sles }}
run: |
echo $DOCKER_TOKEN | docker login -u $DOCKER_USER --password-stdin
if [[ "$(docker manifest inspect $DOCKERIMAGE 2> /dev/null)" != "" ]]; then
echo "imageexists=true" >> $GITHUB_OUTPUT
echo "Image already exists, skip building available"
else
echo "imageexists=false" >> $GITHUB_OUTPUT
echo "Tag does not exist, build and publishing required"
fi
if [[ "$(docker manifest inspect $DOCKERIMAGE_SLES 2> /dev/null)" != "" ]]; then
echo "imageexists_sles=true" >> $GITHUB_OUTPUT
echo "SLES Image already exists, skip building available"
else
echo "imageexists_sles=false" >> $GITHUB_OUTPUT
echo "SLES Tag does not exist, build and publishing required"
fi
build_image:
name: Build image
runs-on: ROCM-Ubuntu
needs: check_image
if: ${{ needs.check_image.outputs.imageexists != 'true' }}
steps:
- uses: actions/checkout@v3
- name: Build and publish
env:
DOCKERIMAGE: ${{ needs.check_image.outputs.imagetag }}
run: |
echo $DOCKER_TOKEN | docker login -u $DOCKER_USER --password-stdin
docker build . --file hip-clang.docker --tag $DOCKERIMAGE;
docker push $DOCKERIMAGE;
build_SLES_image:
name: Build SLES image
runs-on: ROCM-Ubuntu
needs: check_image
if: ${{ needs.check_image.outputs.imageexists_sles != 'true' }}
steps:
- uses: actions/checkout@v3
- name: Build and publish SLES
env:
DOCKERIMAGE_SLES: ${{ needs.check_image.outputs.imagetag_sles }}
run: |
echo $DOCKER_TOKEN | docker login -u $DOCKER_USER --password-stdin
docker build . --file tools/docker/sles.docker --tag $DOCKERIMAGE_SLES;
docker push $DOCKERIMAGE_SLES;
tidy:
runs-on: ROCM-Ubuntu
needs: [ build_image, check_image ]
env:
DOCKERIMAGE: ${{ needs.check_image.outputs.imagetag }}
if: ${{ !cancelled() && (needs.build_image.result == 'success' || needs.build_image.result == 'skipped') }}
steps:
- uses: actions/checkout@v3
# In this step, this action saves a list of existing images;
# the cache is then created without them in the post run.
# It also restores the cache if it exists.
- name: Docker layer cache
uses: jpribyl/action-docker-layer-caching@v0.1.1
with:
key: docker-layer-caching-migraphx-${{hashFiles('hip-clang.docker', '**/*requirements.txt', '**/install_prereqs.sh', 'rbuild.ini')}}
restore-keys:
docker-layer-caching-migraphx-
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
- name: Restore cache files for tidy
uses: actions/cache/restore@v3
id: tidy_restore
......@@ -41,13 +114,13 @@ jobs:
path: tidy-cache
key: tidy-cache-${{ github.ref }}
restore-keys: tidy-cache-
- name: Build the Docker image
- name: Docker Login
run: |
docker build . --file hip-clang.docker --tag migraphx
echo $DOCKER_TOKEN | docker login -u $DOCKER_USER --password-stdin
- name: Clang tidy
shell: bash -c "docker run -i -v=$GITHUB_WORKSPACE:/data -w /data migraphx bash < {0}"
- name: Clang Tidy
shell: bash -c "docker run -i -v=$GITHUB_WORKSPACE:/data -w /data $DOCKERIMAGE bash < {0}"
run: |
mkdir build
cd build
......@@ -84,21 +157,14 @@ jobs:
cppcheck:
runs-on: ROCM-Ubuntu
needs: [ build_image, check_image ]
env:
DOCKERIMAGE: ${{ needs.check_image.outputs.imagetag }}
if: ${{ !cancelled() && (needs.build_image.result == 'success' || needs.build_image.result == 'skipped') }}
steps:
- uses: actions/checkout@v3
# In this step, this action saves a list of existing images;
# the cache is then created without them in the post run.
# It also restores the cache if it exists.
- name: Docker layer cache
uses: jpribyl/action-docker-layer-caching@v0.1.1
with:
key: docker-layer-caching-migraphx-${{hashFiles('hip-clang.docker', '**/*requirements.txt', '**/install_prereqs.sh', 'rbuild.ini')}}
restore-keys:
docker-layer-caching-migraphx-
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
- name: Restore cache files for cppcheck
id: cppcheck_restore
uses: actions/cache/restore@v3
......@@ -107,11 +173,12 @@ jobs:
key: cppcheck-cache-${{ hashFiles('cppcheck.rules', 'CMakeLists.txt') }}-${{ github.ref }}
restore-keys: cppcheck-cache-${{ hashFiles('cppcheck.rules', 'CMakeLists.txt') }}-
- name: Build the Docker image
run: docker build . --file hip-clang.docker --tag migraphx
- name: Docker Login
run: |
echo $DOCKER_TOKEN | docker login -u $DOCKER_USER --password-stdin
- name: Cppcheck
shell: bash -c "docker run -i -v=$GITHUB_WORKSPACE:/data -w /data migraphx bash < {0}"
shell: bash -c "docker run -i -v=$GITHUB_WORKSPACE:/data -w /data $DOCKERIMAGE bash < {0}"
run: |
mkdir build
cd build
......@@ -142,29 +209,23 @@ jobs:
format:
runs-on: ROCM-Ubuntu
runs-on: ubuntu-latest
needs: [ build_image, check_image ]
env:
DOCKERIMAGE: ${{ needs.check_image.outputs.imagetag }}
if: ${{ !cancelled() && (needs.build_image.result == 'success' || needs.build_image.result == 'skipped') }}
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
# In this step, this action saves a list of existing images;
# the cache is then created without them in the post run.
# It also restores the cache if it exists.
- name: Docker layer cache
uses: jpribyl/action-docker-layer-caching@v0.1.1
with:
key: docker-layer-caching-migraphx-${{hashFiles('hip-clang.docker', '**/*requirements.txt', '**/install_prereqs.sh', 'rbuild.ini')}}
restore-keys:
docker-layer-caching-migraphx-
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
- name: Build the Docker image
run: docker build . --file hip-clang.docker --tag migraphx
- name: Docker Login
run: |
echo $DOCKER_TOKEN | docker login -u $DOCKER_USER --password-stdin
- name: Check formatting
shell: bash -c "docker run -i -v=$GITHUB_WORKSPACE:/data -w /data migraphx bash < {0}"
shell: bash -c "docker run -i -v=$GITHUB_WORKSPACE:/data -w /data $DOCKERIMAGE bash < {0}"
run: |
set -e
git config --global --add safe.directory /data
......@@ -172,26 +233,16 @@ jobs:
sles:
runs-on: ROCM-Ubuntu
needs: [ build_SLES_image, check_image ]
env:
DOCKERIMAGE_SLES: ${{ needs.check_image.outputs.imagetag_sles }}
if: ${{ !cancelled() && (needs.build_SLES_image.result == 'success' || needs.build_SLES_image.result == 'skipped') }}
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
# In this step, this action saves a list of existing images;
# the cache is then created without them in the post run.
# It also restores the cache if it exists.
- name: Docker layer cache
uses: jpribyl/action-docker-layer-caching@v0.1.1
with:
key: docker-layer-caching-migraphx-sles-${{hashFiles('hip-clang.docker', '**/*requirements.txt', '**/install_prereqs.sh', 'rbuild.ini')}}
restore-keys:
docker-layer-caching-migraphx-sles-
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
- name: Build the Docker image
run: docker build . --file tools/docker/sles.docker --tag migraphx-sles
- name: Restore cache files for ccache
uses: actions/cache/restore@v3
id: ccache_restore
......@@ -200,8 +251,12 @@ jobs:
key: ccache-sles-${{ github.ref }}
restore-keys: ccache-sles-
- name: Docker Login
run: |
echo $DOCKER_TOKEN | docker login -u $DOCKER_USER --password-stdin
- name: Build migraphx
shell: bash -c "docker run -i -v=$GITHUB_WORKSPACE:/data -w /data migraphx-sles bash < {0}"
shell: bash -c "docker run -i -v=$GITHUB_WORKSPACE:/data -w /data $DOCKERIMAGE_SLES bash < {0}"
run: |
set -e
export CCACHE_COMPRESSLEVEL=10
......
......@@ -131,7 +131,7 @@ In this case, we can create `argument <migraphx::argument>` objects directly fro
std::vector<float> results_vector(64);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(migraphx::verify::verify_range(results_vector, sol));
EXPECT(migraphx::verify::verify_rms_range(results_vector, sol));
An `argument <migraphx::argument>` can handle memory buffers from either the GPU or the CPU.
By default, when running the `program <migraphx::program>`, buffers are allocated on the corresponding target.
......
......@@ -50,9 +50,17 @@ Runs reference and CPU or GPU implementations and checks outputs for consistency
.. include:: ./driver/compile.rst
.. option:: --tolerance [double]
.. option:: --rms-tol [double]
Tolerance for errors (Default: 80)
Tolerance for RMS error (Default: 0.001)
.. option:: --atol [double]
Tolerance for elementwise absolute difference (Default: 0.001)
.. option:: --rtol [double]
Tolerance for elementwise relative difference (Default: 0.001)
.. option:: -i, --per-instruction
......
......@@ -55,7 +55,9 @@ See below for a comprehensive list of commands and option arguments, as well as
| --exhaustive-tune | Enable exhaustive search to find fastest kernel |
| --fp16 | Quantize for fp16 |
| --int8 | Quantize for int8 |
| --tolerance | Tolerance for errors |
| --rms-tol | Tolerance for the RMS error (Default: 0.001) |
| --atol | Tolerance for elementwise absolute difference (Default: 0.001) |
| --rtol | Tolerance for elementwise relative difference (Default: 0.001) |
| --per-instruction \| -i | Verify each instruction |
| --reduce \| -r | Reduce program and verify |
| --iterations \| -n | Number of iterations to run for perf report |
......
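For reference, these three tolerance flags correspond to the two checks implemented later in this diff (see the verify.hpp changes): a root-mean-square comparison against --rms-tol and an elementwise, numpy-allclose-style comparison against --atol/--rtol. With x the target output and y the reference output, the acceptance criteria are, roughly:

    \mathrm{rms\_range}(x, y) \le \mathrm{rms\_tol}
    |x_i - y_i| \le \mathrm{atol} + \mathrm{rtol}\,|y_i| \quad \text{for all } i

Per verify_range_with_tolerance in the same diff, only the RMS check is currently enforced; the elementwise check (allclose) is computed but disabled for now.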
......@@ -142,6 +142,7 @@ register_migraphx_ops(
equal
erf
exp
fill
flatten
floor
fmod
......
......@@ -475,13 +475,15 @@ struct compiler
{
if(is_offload_copy_set(p) and not co.offload_copy)
{
std::cout << "MIGraphX program was likely compiled with offload_copy set, Try "
"passing "
"`--enable-offload-copy` if program run fails.\n";
std::cout << "[WARNING]: MIGraphX program was likely compiled with offload_copy "
"set. Try passing "
"`--enable-offload-copy` if the program run fails.\n";
}
else if(co.offload_copy)
{
std::cout << "MIGraphX program was likely compiled without "
std::cout << "[WARNING]: MIGraphX program was likely compiled without "
"offload_copy set, Try "
"removing "
"`--enable-offload-copy` flag if passed to driver, if program run "
......@@ -534,13 +536,19 @@ struct params : command<params>
struct verify : command<verify>
{
compiler c;
double tolerance = 80;
migraphx::verify::tolerance tols;
bool per_instruction = false;
bool reduce = false;
void parse(argument_parser& ap)
{
c.parse(ap);
ap(tolerance, {"--tolerance"}, ap.help("Tolerance for errors"));
ap(tols.rms_tol, {"--rms-tol"}, ap.help("Tolerance for the RMS error (Default: 0.001)"));
ap(tols.atol,
{"--atol"},
ap.help("Tolerance for the elementwise absolute difference (Default: 0.001)"));
ap(tols.rtol,
{"--rtol"},
ap.help("Tolerance for the elementwise relative difference (Default: 0.001)"));
ap(per_instruction,
{"-i", "--per-instruction"},
ap.help("Verify each instruction"),
......@@ -565,15 +573,15 @@ struct verify : command<verify>
if(per_instruction)
{
verify_instructions(p, t, c.co, quantize, tolerance);
verify_instructions(p, t, c.co, quantize, tols);
}
else if(reduce)
{
verify_reduced_program(p, t, c.co, quantize, m, tolerance);
verify_reduced_program(p, t, c.co, quantize, m, tols);
}
else
{
verify_program(c.l.file, p, t, c.co, quantize, m, tolerance);
verify_program(c.l.file, p, t, c.co, quantize, m, tols);
}
}
};
......
......@@ -77,24 +77,24 @@ void verify_program(const std::string& name,
compile_options options,
precision quantize,
const parameter_map& inputs,
double tolerance)
verify::tolerance tols)
{
auto x = run_ref(p, inputs);
auto y = run_target(p, t, options, quantize, inputs);
auto ref_outs = run_ref(p, inputs);
auto target_outs = run_target(p, t, options, quantize, inputs);
std::size_t output_num = x.size();
std::size_t output_num = ref_outs.size();
for(std::size_t i = 0; i < output_num; ++i)
{
if(x[i].get_shape().type() != y[i].get_shape().type() or
x[i].get_shape().lens() != y[i].get_shape().lens())
if(ref_outs[i].get_shape().type() != target_outs[i].get_shape().type() or
ref_outs[i].get_shape().lens() != target_outs[i].get_shape().lens())
{
std::cout << "FAILED: " << name << std::endl;
std::cout << "Shape mismatch {" << x[i].get_shape() << "} != {" << y[i].get_shape()
std::cout << "Shape mismatch {" << ref_outs[i].get_shape() << "} != {" << target_outs[i].get_shape()
<< "}" << std::endl;
}
else
{
verify_args(name, x[i], y[i], tolerance);
verify_args(name, target_outs[i], verify::expected{ref_outs[i]}, tols);
}
}
}
......@@ -103,7 +103,7 @@ void verify_instructions(const program& prog,
const target& t,
compile_options options,
precision quantize,
double tolerance)
verify::tolerance tols)
{
const auto* mm_prog = prog.get_main_module();
for(auto&& ins : (*mm_prog))
......@@ -134,8 +134,7 @@ void verify_instructions(const program& prog,
{
std::cout << "Verify: " << ins.name() << std::endl;
std::cout << p << std::endl;
verify_program(
ins.name(), p, t, options, quantize, create_param_map(p, false), tolerance);
verify_program(ins.name(), p, t, options, quantize, create_param_map(p, false), tols);
}
catch(...)
{
......@@ -151,13 +150,14 @@ void verify_reduced(program p,
compile_options options,
precision quantize,
const parameter_map& inputs,
double tolerance)
verify::tolerance tols)
{
auto* mm = p.get_main_module();
auto last = std::prev(mm->end(), n);
mm->remove_instructions(last, mm->end());
std::cout << "Verify: " << n << std::endl;
std::cout << p << std::endl;
try
{
verify_program(std::to_string(n), p, t, options, quantize, inputs, tols);
......@@ -167,6 +167,9 @@ void verify_reduced(program p,
std::cout << "FAILED: " << n << std::endl;
std::cout << "Exception: " << e.what() << std::endl;
}
}
void verify_reduced_program(const program& p,
......@@ -174,13 +177,14 @@ void verify_reduced_program(const program& p,
compile_options options,
precision quantize,
const parameter_map& inputs,
double tolerance)
verify::tolerance tols)
{
const auto* mm = p.get_main_module();
auto n = std::distance(mm->begin(), mm->end());
std::cout << "Verify steps: " << n << std::endl;
for(std::size_t i = 1; i < n; i++)
{
auto last = std::prev(mm->end(), i + 1);
if(contains({"@literal", "@param"}, last->name()))
{
......@@ -188,6 +192,9 @@ void verify_reduced_program(const program& p,
continue;
}
verify_reduced(p, i, t, options, quantize, inputs, tols);
}
}
......
......@@ -26,6 +26,7 @@
#include "precision.hpp"
#include <migraphx/program.hpp>
#include <migraphx/verify.hpp>
namespace migraphx {
namespace driver {
......@@ -37,18 +38,18 @@ void verify_program(const std::string& name,
compile_options options = compile_options{},
precision quantize = precision::fp32,
const parameter_map& inputs = {},
double tolerance = 100);
verify::tolerance tols = verify::tolerance{});
void verify_instructions(const program& prog,
const target& t,
compile_options options = compile_options{},
precision quantize = precision::fp32,
double tolerance = 80);
verify::tolerance tols = verify::tolerance{});
void verify_reduced_program(const program& p,
const target& t,
compile_options options = compile_options{},
precision quantize = precision::fp32,
const parameter_map& inputs = {},
double tolerance = 80);
verify::tolerance tols = verify::tolerance{});
} // namespace MIGRAPHX_INLINE_NS
} // namespace driver
......
......@@ -153,7 +153,7 @@ struct check_shapes
{
if(begin != end)
{
if(begin->max_lens().size() != n)
if(begin->ndim() != n)
MIGRAPHX_THROW(prefix() + "Only " + std::to_string(n) + "d supported");
}
return *this;
......@@ -168,7 +168,7 @@ struct check_shapes
{
if(begin != end)
{
if(begin->max_lens().size() > n)
if(begin->ndim() > n)
MIGRAPHX_THROW(prefix() + "Shape must have at most " + std::to_string(n) +
" dimensions");
}
......@@ -184,7 +184,7 @@ struct check_shapes
{
if(begin != end)
{
if(begin->max_lens().size() < n)
if(begin->ndim() < n)
MIGRAPHX_THROW(prefix() + "Shape must have at least " + std::to_string(n) +
" dimensions");
}
......@@ -254,6 +254,16 @@ struct check_shapes
return *this;
}
/*!
* Check all shapes are scalar.
*/
const check_shapes& scalar() const
{
if(not this->all_of([](const shape& s) { return s.scalar(); }))
MIGRAPHX_THROW(prefix() + "Shapes are not scalar");
return *this;
}
/*!
* Check all shapes are standard or scalar.
*/
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_OPERATORS_FILL_HPP
#define MIGRAPHX_GUARD_OPERATORS_FILL_HPP
#include <migraphx/check_shapes.hpp>
#include <migraphx/dyn_output.hpp>
#include <migraphx/par_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
/**
* fill(default_value, output_buffer)
* Fill an output buffer with the given default_value.
* Note that if the default_value is a literal and the output_buffer
* has a static shape, this operator can be replaced with a literal.
*/
struct fill
{
std::string name() const { return "fill"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this, true}.has(2).same_type();
if(inputs.at(0).dynamic() or inputs.at(0).elements() != 1)
{
MIGRAPHX_THROW("FILL: default_value is dynamic or more than one element");
}
return inputs.back();
}
argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
{
visit_all(args[0], args[1])([&](auto value, auto output) {
par_for(dyn_out.computed_shape.elements(), [&](auto i) { output[i] = value.front(); });
});
return args[1];
}
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 1; }
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
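As an aside, the semantics described in the doc comment can be illustrated with a small standalone C++ sketch (not MIGraphX code; fill_buffer is a made-up name for this illustration): the single default value is written into every element of the output buffer, and that same buffer is returned, mirroring output_alias() returning index 1.

#include <cassert>
#include <iostream>
#include <vector>

// Standalone analogue of op::fill::compute (illustrative only): write the single
// default value into every element of the output buffer and return that buffer.
template <class T>
std::vector<T>& fill_buffer(const std::vector<T>& default_value, std::vector<T>& output_buffer)
{
    assert(default_value.size() == 1); // fill expects a static, single-element default_value
    for(auto& x : output_buffer)
        x = default_value.front();
    return output_buffer;
}

int main()
{
    std::vector<float> value = {3.5f};
    std::vector<float> buffer(6, 0.0f);
    fill_buffer(value, buffer);
    for(float x : buffer)
        std::cout << x << " "; // prints "3.5 " six times
    std::cout << "\n";
    return 0;
}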
......@@ -55,6 +55,7 @@
#include <migraphx/op/equal.hpp>
#include <migraphx/op/erf.hpp>
#include <migraphx/op/exp.hpp>
#include <migraphx/op/fill.hpp>
#include <migraphx/op/flatten.hpp>
#include <migraphx/op/floor.hpp>
#include <migraphx/op/fmod.hpp>
......
......@@ -29,6 +29,7 @@
#include <functional>
#include <iostream>
#include <numeric>
#include <cassert>
#include <migraphx/float_equal.hpp>
#include <migraphx/config.hpp>
......@@ -187,16 +188,99 @@ double rms_range(const R1& r1, const R2& r2)
return std::numeric_limits<range_value<R1>>::max();
}
template <class R>
double get_rms_tol(const R&, std::size_t tolerance = 80)
{
double threshold = std::numeric_limits<range_value<R>>::epsilon() * tolerance;
return threshold;
}
/*
C++ doesn't support named arguments; this is just a wrapper that helps distinguish between
actual-result and expected-result arguments.
*/
template <class T>
struct expected
{
expected() = default;
explicit expected(const T& input) : x(&input) {}
const T& data() const
{
assert(x != nullptr);
return *x;
}
private:
const T* x = nullptr;
};
// deduction guide for templated expected class
template <class T>
expected(const T&) -> expected<T>;
struct tolerance
{
double rms_tol = 0.001;
double atol = 0.001;
double rtol = 0.001;
};
/*
MIGraphX implementation of numpy's np.allclose(), which checks whether the elementwise absolute
difference is within tolerance using this formula: abs(a - b) <= atol + rtol * abs(b)
*/
template <class R1, class R2>
bool verify_range(const R1& r1, const R2& r2, double tolerance = 80, double* out_error = nullptr)
bool allclose(const R1& r1, const R2& r2, tolerance tols)
{
double threshold = std::numeric_limits<range_value<R1>>::epsilon() * tolerance;
std::size_t n = range_distance(r1);
if(n == range_distance(r2))
{
auto idx = mismatch_idx(r1, r2, [&](auto x, auto y) {
return abs_diff(double(x), double(y)) > tols.atol + tols.rtol * std::abs(double(y));
});
return idx >= range_distance(r1);
}
return false;
}
template <class R1, class R2>
bool verify_rms_range(const R1& r1,
const R2& r2,
std::size_t tolerance = 80,
double* out_rms_error = nullptr)
{
double threshold = get_rms_tol(r1, tolerance);
auto error = rms_range(r1, r2);
if(out_error != nullptr)
*out_error = error;
if(out_rms_error != nullptr)
*out_rms_error = error;
return error <= threshold;
}
template <class R1, class R2>
bool verify_range_with_tolerance(const R1& r1,
const expected<R2>& r2,
tolerance tols = tolerance{},
double* out_rms_error = nullptr)
{
auto rms_error = rms_range(r1, r2.data());
// Elementwise verification is disabled for now; it requires a lot of tests to be fixed.
// auto ewise_verify = allclose(r1, r2.data(), tols);
if(out_rms_error != nullptr)
*out_rms_error = rms_error;
return rms_error <= tols.rms_tol;
}
// The expected argument should be passed second; if it is passed first by mistake,
// flip the order.
template <class R1, class R2>
bool verify_range_with_tolerance(const expected<R1>& r1,
const R2& r2,
tolerance tols = tolerance{},
double* out_rms_error = nullptr)
{
return verify_range_with_tolerance(r2, r1, tols, out_rms_error);
}
} // namespace verify
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
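To illustrate why the expected<T> wrapper and the tolerance struct make call sites harder to get wrong, here is a stripped-down, self-contained C++ sketch of the same pattern (the types below are simplified stand-ins written for this illustration, not the MIGraphX headers):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Simplified stand-ins for verify::tolerance and verify::expected from this diff.
struct tolerance
{
    double rms_tol = 0.001;
    double atol    = 0.001;
    double rtol    = 0.001;
};

template <class T>
struct expected
{
    explicit expected(const T& input) : x(&input) {}
    const T& data() const { return *x; }

    private:
    const T* x;
};

// allclose-style elementwise check: pass when |a - b| <= atol + rtol * |b| for every element.
bool allclose(const std::vector<double>& result,
              const expected<std::vector<double>>& ref,
              tolerance tols = {})
{
    const auto& gold = ref.data();
    if(result.size() != gold.size())
        return false;
    for(std::size_t i = 0; i < result.size(); ++i)
    {
        if(std::abs(result[i] - gold[i]) > tols.atol + tols.rtol * std::abs(gold[i]))
            return false;
    }
    return true;
}

int main()
{
    std::vector<double> target = {1.0005, 2.0, 3.0};
    std::vector<double> gold   = {1.0, 2.0, 3.0};
    // The expected{} wrapper documents which argument is the reference output.
    std::cout << std::boolalpha << allclose(target, expected{gold}) << "\n"; // true
    return 0;
}

In this sketch, passing the two ranges in the wrong order does not compile; the diff above instead adds an overload that flips the arguments when that happens.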
......@@ -31,11 +31,15 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
MIGRAPHX_EXPORT
bool verify_args(const std::string& name,
const argument& ref_arg,
const argument& target_arg,
double tolerance = 80);
MIGRAPHX_EXPORT bool verify_args(const std::string& name,
const argument& target_arg,
const verify::expected<argument>& ref_arg,
verify::tolerance);
MIGRAPHX_EXPORT bool verify_args_with_tolerance(const std::string& name,
const argument& target_arg,
const verify::expected<argument>& ref_arg,
std::size_t tolerance = 80);
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -21,6 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/instruction.hpp>
#include <migraphx/load_save.hpp>
#include <migraphx/file_buffer.hpp>
#include <migraphx/json.hpp>
......@@ -60,9 +61,29 @@ void save(const program& p, const std::string& filename, const file_options& opt
{
write_buffer(filename, save_buffer(p, options));
}
// MIOpen doesn't support serializing fusion plans with Find-2.0 APIs
void print_miopen_warning(const program& p)
{
auto mods = p.get_modules();
if(std::any_of(mods.begin(), mods.end(), [](const auto* m) {
return std::any_of(m->begin(), m->end(), [](const instruction& i) {
return i.name() == "gpu::miopen_fusion";
});
}))
{
std::cout << "[WARNING]: Program has miopen_fusion instructions for which tuned solutions "
"are not stored inside serialized MIGraphX program. Consider serializing with "
"MIGRAPHX_DISABLE_MIOPEN_FUSION=1 flag set."
<< std::endl;
;
}
}
std::vector<char> save_buffer(const program& p, const file_options& options)
{
value v = p.to_value();
print_miopen_warning(p);
std::vector<char> buffer;
if(options.format == "msgpack")
{
......
......@@ -1325,48 +1325,59 @@ struct find_split_reshape
void apply(module& m, const match::matcher_result& r) const
{
auto slc = r.instructions["slice"];
auto rsp = r.instructions["reshape"];
auto slc = r.instructions["slice"];
auto rsp = r.instructions["reshape"];
auto input = slc->inputs().front();
// Only apply simplification when slices are on a single axis
auto axes = any_cast<op::slice>(slc->get_operator()).axes;
if(axes.size() > 1)
{
return;
}
auto input = slc->inputs().front();
auto split_outputs = get_splits(input);
if(split_outputs.empty())
{
return;
}
// Only want to apply this optimization if each split output is followed by
// a contiguous op and a reshape
if(std::any_of(split_outputs.begin(), split_outputs.end(), [](auto i) {
if(i->outputs().size() == 1)
{
auto cont = i->outputs().front();
return cont->outputs().size() != 1;
}
return false;
}))
// Find all the reshapes (similar to rsp) that can be simplified
std::vector<instruction_ref> conts;
std::vector<instruction_ref> vec_rsp;
// Iterate through slice and contiguous outputs to allow simplifications when
// slice is followed by multiple reshapes
for(auto& i : split_outputs)
{
return;
std::copy_if(i->outputs().begin(),
i->outputs().end(),
std::back_inserter(conts),
[](auto j) { return j->name() == "contiguous"; });
}
std::vector<instruction_ref> vec_rsp(split_outputs.size());
std::transform(split_outputs.begin(), split_outputs.end(), vec_rsp.begin(), [](auto i) {
auto cont = i->outputs().front();
return cont->outputs().front();
});
for(auto& i : conts)
{
std::copy_if(i->outputs().begin(),
i->outputs().end(),
std::back_inserter(vec_rsp),
[&](auto j) { return j->get_operator() == rsp->get_operator(); });
}
// all outputs are reshape and of the same shape
auto dims = any_cast<op::reshape>(rsp->get_operator()).dims;
if(not same_ops(vec_rsp))
// No simplification needed if there is only one slice -> cont -> reshape
if(vec_rsp.size() <= 1)
{
return;
}
// ensure reshape happens after the axis dimension
auto axis = any_cast<op::slice>(slc->get_operator()).axes[0];
auto axis = axes[0];
auto slc_lens = slc->get_shape().lens();
auto slc_dim_size = std::accumulate(
slc_lens.begin() + axis, slc_lens.end(), 1, std::multiplies<std::size_t>());
auto input_lens = input->get_shape().lens();
auto input_size = input->get_shape().elements();
auto slc_axis_len = input_lens[axis];
// search the reshape output (standard shape) to decide which axis are
// in its output corresponding to the slc_dim_size
......@@ -1393,16 +1404,67 @@ struct find_split_reshape
{
rsp_axis = std::distance(rsp_strides.begin(), ait);
}
// calculate reshape output shape
std::vector<int64_t> vec_dims(vec_rsp.size());
std::transform(vec_rsp.begin(), vec_rsp.end(), vec_dims.begin(), [&](auto is) {
return is->get_shape().lens()[rsp_axis];
});
// Calculate reshape output shape
// Need to find a reshape such that data represented by instructions in vec_rsp can be
// written as slices of this new reshape. This is done by holding all the dims constant in
// rsp_lens to compute the required dim for rsp_axis (axis that will be sliced)
// ex 1: Input Shape: {2, 12, 4}, Slice Axis: 1, Slices are: (0:4), (4:8), (8:12),
// Reshape Outputs: {2, 2, 2, 4}, {2, 2, 2, 4}, {2, 2, 2, 4}
// rsp_axis = 1, rsp_out_lens (initial) = {2, 1, 2, 4}, rsp_fixed_size = 2*1*2*4 = 16
// rsp_axis_len = 2*12*4 / 16 = 6
// rsp_out_lens (final) = {2, 6, 2, 4}
// ex 2: Input Shape: {2, 12, 4}, Slice Axis: 1, Slices are: (0:4), (4:8), (8:12),
// Reshape Outputs: {2, 16}, {2, 16}, {2, 16}
// rsp_axis = 1, rsp_out_lens (initial) = {2, 1}, rsp_fixed_size = 2*1 = 2
// rsp_axis_len = 2*12*4 / 2 = 48
// rsp_out_lens (final) = {2, 48}
std::vector<int64_t> rsp_out_lens(rsp_lens.begin(), rsp_lens.end());
rsp_out_lens[rsp_axis] = 1;
auto rsp_fixed_size = std::accumulate(
rsp_out_lens.begin(), rsp_out_lens.end(), 1, std::multiplies<std::size_t>());
rsp_out_lens[rsp_axis] = std::accumulate(vec_dims.begin(), vec_dims.end(), std::int64_t{0});
// cannot create a valid reshape for simplification
if(input_size % rsp_fixed_size != 0)
{
return;
}
auto rsp_axis_len = input_size / rsp_fixed_size;
rsp_out_lens[rsp_axis] = rsp_axis_len;
// Calculate new slice start and end indices. Indices are scaled using the new reshape axis
// and the original slice axis. See examples:
// ex 1: Input Shape: {2, 12, 4}, Slice Axis: 1, Slices are: (0:4), (4:8), (8:12),
// Reshape Outputs: {2, 2, 2, 4}, {2, 2, 2, 4}, {2, 2, 2, 4}
// slc_axis_len = 12, rsp_axis_len = 6
// New Starts: {0*6/12, 4*6/12, 8*6/12} = {0, 2, 4}
// New Ends: {4*6/12, 8*6/12, 12*6/12} = {2, 4, 6}
// ex 2: Input Shape: {2, 12, 4}, Slice Axis: 1, Slices are: (0:4), (4:8), (8:12),
// Reshape Outputs: {2, 16}, {2, 16}, {2, 16}
// slc_axis_len = 12, rsp_axis_len = 48
// New Starts: {0*48/12, 4*48/12, 8*48/12} = { 0, 16, 32}
// New Ends: {4*48/12, 8*48/12, 12*48/12} = {16, 32, 48}
std::vector<int64_t> new_starts(vec_rsp.size());
std::transform(vec_rsp.begin(), vec_rsp.end(), new_starts.begin(), [&](auto is) {
auto cont = is->inputs().front();
auto og_slc = cont->inputs().front();
return any_cast<op::slice>(og_slc->get_operator()).starts[0] * rsp_axis_len /
slc_axis_len;
});
std::vector<int64_t> new_ends(vec_rsp.size());
std::transform(vec_rsp.begin(), vec_rsp.end(), new_ends.begin(), [&](auto is) {
auto cont = is->inputs().front();
auto og_slc = cont->inputs().front();
return any_cast<op::slice>(og_slc->get_operator()).ends[0] * rsp_axis_len /
slc_axis_len;
});
// insert the reshape instruction and add contiguous if needed
if(not input->get_shape().standard())
......@@ -1413,16 +1475,14 @@ struct find_split_reshape
std::next(input), make_op("reshape", {{"dims", rsp_out_lens}}), input);
// replace the original reshape with slice
int64_t start = 0;
for(std::size_t i = 0; i < vec_rsp.size(); ++i)
{
m.replace_instruction(
vec_rsp[i],
make_op(
"slice",
{{"axes", {rsp_axis}}, {"starts", {start}}, {"ends", {start + vec_dims[i]}}}),
{{"axes", {rsp_axis}}, {"starts", {new_starts[i]}}, {"ends", {new_ends[i]}}}),
rsp_ins);
start += vec_dims[i];
}
}
};
......
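The start/end scaling used above can be sanity-checked in isolation. A minimal standalone C++ sketch (scale_indices is an invented helper for this illustration, not MIGraphX API) reproducing the numbers from example 1 in the comments:

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Scale slice boundaries from the original slice axis onto the new reshape axis:
// new_index = old_index * rsp_axis_len / slc_axis_len
std::vector<std::int64_t> scale_indices(const std::vector<std::int64_t>& idx,
                                        std::int64_t rsp_axis_len,
                                        std::int64_t slc_axis_len)
{
    std::vector<std::int64_t> out;
    out.reserve(idx.size());
    for(std::int64_t i : idx)
        out.push_back(i * rsp_axis_len / slc_axis_len);
    return out;
}

int main()
{
    // Example 1 from the comments above: input {2, 12, 4}, slices (0:4), (4:8), (8:12),
    // reshape outputs {2, 2, 2, 4}, so slc_axis_len = 12 and rsp_axis_len = 6.
    auto new_starts = scale_indices({0, 4, 8}, 6, 12);  // {0, 2, 4}
    auto new_ends   = scale_indices({4, 8, 12}, 6, 12); // {2, 4, 6}
    assert((new_starts == std::vector<std::int64_t>{0, 2, 4}));
    assert((new_ends == std::vector<std::int64_t>{2, 4, 6}));
    std::cout << "slice bounds rescaled as in find_split_reshape\n";
    return 0;
}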
......@@ -50,6 +50,7 @@ file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
add_embed_library(migraphx_kernels ${KERNEL_FILES} RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/)
configure_file(device/targets.hpp.in include/migraphx/gpu/device/targets.hpp)
file(GLOB DEVICE_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp)
add_library(migraphx_device ${DEVICE_GPU_SRCS})
......@@ -69,6 +70,7 @@ rocm_clang_tidy_check(migraphx_device)
target_link_libraries(migraphx_device PUBLIC migraphx)
target_link_libraries(migraphx_device PRIVATE compile_for_gpu)
target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>)
target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
target_compile_options(migraphx_device PRIVATE -Wno-ignored-attributes)
migraphx_generate_export_header(migraphx_device DIRECTORY migraphx/gpu/device)
......
......@@ -26,7 +26,9 @@
#include <hip/hip_runtime.h>
#include <migraphx/config.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/targets.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -84,8 +86,15 @@ inline auto launch(hipStream_t stream, index_int global, index_int local)
hipError_t kernel_launch_status = hipGetLastError();
if(kernel_launch_status != hipSuccess)
{
MIGRAPHX_THROW("MIGraphX device kernel failed to launch with error: " +
std::string(hipGetErrorString(kernel_launch_status)));
std::string message = hipGetErrorString(kernel_launch_status);
if(not contains(get_targets(), get_device_name()))
{
message += ". Trying to run a kernel for " + get_device_name() +
" but MIGraphX was built for targets " + get_targets_as_string() +
". Please rebuild MIGraphX with -DGPU_TARGETS='" + get_device_name() +
"'.";
}
MIGRAPHX_THROW("MIGraphX device kernel failed to launch with error: " + message);
}
};
}
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/device/targets.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/errors.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
static std::vector<std::string> parse_targets() { return split_string(MIGRAPHX_GPU_TARGETS, ';'); }
const std::vector<std::string>& get_targets()
{
static auto result = parse_targets();
return result;
}
std::string get_targets_as_string() { return join_strings(get_targets(), ", "); }
static int get_device_id()
{
int device;
auto status = hipGetDevice(&device);
if(status != hipSuccess)
MIGRAPHX_THROW("No device");
return device;
}
std::string get_device_name()
{
hipDeviceProp_t props{};
auto status = hipGetDeviceProperties(&props, get_device_id());
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to get device properties");
return props.gcnArchName;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_DEVICE_TARGETS_CPP
#define MIGRAPHX_GUARD_DEVICE_TARGETS_CPP
#include <migraphx/config.hpp>
#include <string>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
#define MIGRAPHX_GPU_TARGETS "@GPU_TARGETS@" // NOLINT
const std::vector<std::string>& get_targets();
std::string get_targets_as_string();
std::string get_device_name();
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_DEVICE_TARGETS_CPP
......@@ -103,7 +103,10 @@ struct mlir_op
}
if(ins->name() == "@return")
{
return ins_shapes[ins->inputs().at(0)].with_type(type);
auto s = ins_shapes[ins->inputs().at(0)].with_type(type);
if(not s.standard())
MIGRAPHX_THROW("MLIR doesnt support non-standard output");
return s;
}
std::vector<shape> input_shapes;
input_shapes.resize(ins->inputs().size());
......@@ -299,10 +302,8 @@ struct find_mlir_fused_ops
}
};
struct find_mlir_standalone_convolution_op
struct find_mlir_standalone_op
{
auto matcher() const { return match::name("convolution"); }
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto conv_based_op = r.result;
......@@ -324,6 +325,16 @@ struct find_mlir_standalone_convolution_op
}
};
struct find_mlir_standalone_convolution_op : find_mlir_standalone_op
{
auto matcher() const { return match::name("convolution"); }
};
struct find_mlir_standalone_dot_op : find_mlir_standalone_op
{
auto matcher() const { return match::name("dot"); }
};
/**
* @brief Declares a new MIGraphX environment variable which forces MIGraphX to generate
* only specific MLIR operations.
......@@ -331,7 +342,7 @@ struct find_mlir_standalone_convolution_op
* The variable, if defined, forces MIGraphX to use only specific operations
* with MLIR regardless of the underlying GPU architecture. The variable accepts
* a comma-separated list of operations. The variable recognizes the following
* operations: "fused", "convolution". If the variable is not defined MIGraphX
* operations: "fused", "convolution", "dot". If the variable is not defined MIGraphX
* will decide by itself which operations to delegate to MLIR. The variable is
* intended to be primarily used by rocMLIR developers.
*/
......@@ -346,31 +357,33 @@ bool is_requested(std::string_view option)
return contains(options, option);
}
bool is_fusion_enabled()
bool is_enabled(std::string_view op_name, context* ctx)
{
if(is_self_decide())
{
return true;
}
return is_requested("fused");
}
bool is_standalone_convs_enabled(context* ctx)
{
if(is_self_decide())
{
if(ctx == nullptr)
if(op_name == "fused")
{
return false;
return true;
}
else if(op_name == "convolution")
{
if(ctx == nullptr)
{
return false;
}
else
{
const auto& device = ctx->get_current_device();
const std::string navi_family{"gfx110"};
return starts_with(device.get_gfx_name(), navi_family);
}
}
else
{
const auto& device = ctx->get_current_device();
const std::string navi_family{"gfx110"};
return starts_with(device.get_gfx_name(), navi_family);
return false;
}
}
return is_requested("convolution");
return is_requested(op_name);
}
} // namespace
......@@ -379,21 +392,25 @@ bool is_standalone_convs_enabled(context* ctx)
void fuse_mlir::apply(module_pass_manager& mpm) const
{
#ifdef MIGRAPHX_MLIR
if(is_fusion_enabled())
if(is_enabled("fused", this->ctx))
{
match::find_matches(mpm, find_mlir_fused_ops{});
}
if(is_standalone_convs_enabled(this->ctx))
if(is_enabled("convolution", this->ctx))
{
match::find_matches(mpm, find_mlir_standalone_convolution_op{});
}
if(is_enabled("dot", this->ctx))
{
match::find_matches(mpm, find_mlir_standalone_dot_op{});
}
#else
(void)mpm;
#endif
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx