Commit 3a4d36cf authored by charlie

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_model_test

parents 6bec381f e19f78ae
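Note: much of this merge normalizes !, &&, and || to C++'s alternative tokens not, and, and or. These are standard keywords with identical semantics, as a standalone check shows:

    #include <cassert>

    int main()
    {
        bool a = true;
        bool b = false;
        // `not`, `and`, `or` are part of ISO C++; no header is needed.
        assert((not b) == !b);
        assert((a and not b) == (a && !b));
        assert((a or b) == (a || b));
    }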
......@@ -244,7 +244,6 @@ struct ref_convolution : auto_register_op<ref_convolution<Op>>
auto weights_lens = args[1].get_shape().lens();
std::vector<std::size_t> k_lens{weights_lens.begin() + 2, weights_lens.end()};
padding = calc_dyn_auto_pad(img_lens, k_lens, op.stride, op.dilation);
- std::cout << "[ ";
output_shape =
compute_padded_shape({args.at(0).get_shape(), args.at(1).get_shape()}, padding);
}
......
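For context, calc_dyn_auto_pad in the hunk above computes auto-padding at evaluation time, once the actual image dimensions of a dynamic shape are known. A minimal sketch of SAME_UPPER-style padding arithmetic (hypothetical helper and return layout; the real function may differ):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Pick padding so the output length is ceil(input / stride), per spatial dim.
    std::vector<std::size_t> sketch_auto_pad(const std::vector<std::size_t>& img,
                                             const std::vector<std::size_t>& kernel,
                                             const std::vector<std::size_t>& stride,
                                             const std::vector<std::size_t>& dilation)
    {
        std::vector<std::size_t> before(img.size());
        std::vector<std::size_t> after(img.size());
        for(std::size_t i = 0; i < img.size(); ++i)
        {
            std::size_t eff_k = (kernel[i] - 1) * dilation[i] + 1;    // dilated kernel extent
            std::size_t out   = (img[i] + stride[i] - 1) / stride[i]; // ceil division
            std::int64_t total = std::max<std::int64_t>(
                0,
                static_cast<std::int64_t>((out - 1) * stride[i] + eff_k) -
                    static_cast<std::int64_t>(img[i]));
            before[i] = total / 2; // SAME_UPPER: the extra cell, if any, goes at the end
            after[i]  = total - total / 2;
        }
        before.insert(before.end(), after.begin(), after.end()); // {before..., after...}
        return before;
    }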
......@@ -100,7 +100,7 @@ struct parse_conv : op_parser<parse_conv>
{
MIGRAPHX_THROW("padding should have 4 values");
}
- if(padding[0] != padding[2] || padding[1] != padding[3])
+ if(padding[0] != padding[2] or padding[1] != padding[3])
{
MIGRAPHX_THROW("migraphx does not support asymetric padding");
}
......
......@@ -90,7 +90,7 @@ struct parse_depthwiseconv : op_parser<parse_depthwiseconv>
calculate_padding(0, pads, input_dims[2], op.stride[0], op.dilation[0], weight_h);
calculate_padding(1, pads, input_dims[3], op.stride[1], op.dilation[1], weight_w);
- if(pads[0] != pads[2] || pads[1] != pads[3])
+ if(pads[0] != pads[2] or pads[1] != pads[3])
{
std::vector<int64_t> padding = {0, 0, pads[0], pads[1], 0, 0, pads[2], pads[3]};
l0 = info.add_instruction(migraphx::make_op("pad", {{"pads", padding}}), l0);
......
......@@ -42,7 +42,7 @@ struct parse_pooling : op_parser<parse_pooling>
tf_parser::node_info info,
std::vector<instruction_ref> args) const
{
- if(!starts_with(opd.tf_name, "Max") && !starts_with(opd.tf_name, "Av"))
+ if(not starts_with(opd.tf_name, "Max") and not starts_with(opd.tf_name, "Av"))
{
MIGRAPHX_THROW("tf pooling mode must be Max or Average");
}
......
......@@ -41,8 +41,9 @@ struct parse_relu6 : op_parser<parse_relu6>
const tf_parser::node_info& info,
std::vector<instruction_ref> args) const
{
- auto min_val = info.add_literal(0.0f);
- auto max_val = info.add_literal(6.0f);
+ shape::type_t output_type = args[0]->get_shape().type();
+ auto min_val = info.add_literal(migraphx::literal{migraphx::shape{output_type}, {0.0f}});
+ auto max_val = info.add_literal(migraphx::literal{migraphx::shape{output_type}, {6.0f}});
return info.add_common_op("clip", args[0], min_val, max_val);
}
......
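The relu6 change makes the clip bounds carry the input's element type instead of hard-coding float, so half- or double-typed graphs are clipped with matching literals. The pattern in isolation (hypothetical helper, built from the same literal/shape constructors used above):

    #include <migraphx/literal.hpp>
    #include <migraphx/shape.hpp>

    // A scalar literal whose element type follows the input (e.g. half_type),
    // instead of a hard-coded float literal.
    inline migraphx::literal typed_scalar(migraphx::shape::type_t t, float v)
    {
        return migraphx::literal{migraphx::shape{t}, {v}};
    }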
......@@ -347,7 +347,7 @@ void tf_parser::parse_node(const std::string& name)
// input was from a node with multiple outputs
if(contains(input_name, ':'))
{
- input_name = input_name.substr(0, input.find(':'));
+ input_name.resize(input.find(':'));
}
else
{
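resize truncates the string in place, where the old substr call built a temporary string and assigned it back. In isolation:

    #include <cassert>
    #include <string>

    int main()
    {
        std::string name = "node:1";
        // resize(pos) keeps the first pos characters with no temporary.
        // Safe here only because the caller has checked the name contains ':',
        // since resize(std::string::npos) would throw std::length_error.
        name.resize(name.find(':'));
        assert(name == "node");
    }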
......@@ -371,7 +371,7 @@ void tf_parser::parse_node(const std::string& name)
{
result = ops[node.op()](*this, {get_attributes(node), node.op(), mm}, args);
}
- assert(!result.empty());
+ assert(not result.empty());
// First output has no ":" delimiter
instructions[name] = result.front();
for(size_t i = 1; i < result.size(); i++)
......@@ -458,7 +458,7 @@ literal tf_parser::parse_tensor(const tensorflow::TensorProto& t) const
{
std::vector<size_t> dims = parse_dims(t.tensor_shape());
size_t shape_size = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<size_t>());
- if(!t.tensor_content().empty()) // has raw data
+ if(not t.tensor_content().empty()) // has raw data
{
const std::string& s = t.tensor_content();
switch(t.dtype())
......
......@@ -78,7 +78,7 @@ void tmp_dir::execute(const std::string& exe, const std::string& args) const
tmp_dir::~tmp_dir()
{
- if(!enabled(MIGRAPHX_DEBUG_SAVE_TEMP_DIR{}))
+ if(not enabled(MIGRAPHX_DEBUG_SAVE_TEMP_DIR{}))
{
fs::remove_all(this->path);
}
......
......@@ -400,7 +400,7 @@ std::pair<value*, bool> value::insert(const value& v)
{
if(v.key.empty())
{
- if(!x)
+ if(not x)
x = std::make_shared<array_value_holder>();
get_array_impl(x).push_back(v);
assert(this->if_array());
......@@ -408,7 +408,7 @@ std::pair<value*, bool> value::insert(const value& v)
}
else
{
- if(!x)
+ if(not x)
x = std::make_shared<object_value_holder>();
auto p = x->if_object()->emplace(v.key, get_array_impl(x).size());
if(p.second)
......@@ -420,7 +420,7 @@ std::pair<value*, bool> value::insert(const value& v)
value* value::insert(const value* pos, const value& v)
{
assert(v.key.empty());
- if(!x)
+ if(not x)
x = std::make_shared<array_value_holder>();
auto&& a = get_array_impl(x);
auto it = a.insert(a.begin() + (pos - begin()), v);
......@@ -466,7 +466,7 @@ bool compare(const value& x, const value& y, F f)
value::type_t value::get_type() const
{
- if(!x)
+ if(not x)
return null_type;
return x->get_type();
}
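The value hunks above share a lazy-initialization pattern: the holder x is allocated on first insert, and an empty value reports null_type. A reduced sketch of the pattern (hypothetical type, not the migraphx::value implementation):

    #include <cassert>
    #include <memory>
    #include <vector>

    struct lazy_array
    {
        std::shared_ptr<std::vector<int>> x;

        void insert(int v)
        {
            if(not x) // allocate storage on first use
                x = std::make_shared<std::vector<int>>();
            x->push_back(v);
        }
        bool is_null() const { return not x; } // empty value has a distinct "null" state
    };

    int main()
    {
        lazy_array a;
        assert(a.is_null());
        a.insert(42);
        assert(not a.is_null());
    }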
......@@ -511,14 +511,7 @@ void print_value(std::ostream& os, const std::vector<value>& x)
os << "}";
}
- void print_value(std::ostream& os, const value::binary& x)
- {
- // Convert binary to integers
- std::vector<int> v(x.begin(), x.end());
- os << "{";
- os << to_string_range(v);
- os << "}";
- }
+ void print_value(std::ostream& os, const value::binary& x) { os << x; }
std::ostream& operator<<(std::ostream& os, const value& d)
{
......
......@@ -55,7 +55,7 @@ struct simple_custom_op final : migraphx::experimental_custom_op_base
virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
{
- if(!inputs[0].standard())
+ if(not inputs[0].standard())
{
throw std::runtime_error("first arg must be standard shaped");
}
......
......@@ -49,6 +49,6 @@ bool create_shapes(bool dynamic_allowed)
TEST_CASE(allow_dynamic_shape) { EXPECT(create_shapes(true)); }
- TEST_CASE(fail_dynamic_shape) { EXPECT(!create_shapes(false)); }
+ TEST_CASE(fail_dynamic_shape) { EXPECT(not create_shapes(false)); }
int main(int argc, const char* argv[]) { test::run(argc, argv); }
......@@ -187,7 +187,7 @@ TEST_CASE(print_test)
std::stringstream ss;
ss << p;
std::string s = ss.str();
- EXPECT(!s.empty());
+ EXPECT(not s.empty());
}
TEST_CASE(param_test)
......
......@@ -26,8 +26,9 @@
#include <migraphx/make_op.hpp>
#include <migraphx/program.hpp>
#include <migraphx/register_target.hpp>
- #include <migraphx/ref/target.hpp>
+ #include <migraphx/fpga/target.hpp>
#include <migraphx/target_assignments.hpp>
+ #include <migraphx/iterator_for.hpp>
migraphx::program create_program()
{
......@@ -37,8 +38,8 @@ migraphx::program create_program()
auto x = mm->add_parameter("x", s);
auto y = mm->add_parameter("y", s);
auto z = mm->add_parameter("z", s);
- auto diff = mm->add_instruction(migraphx::make_op("div"), x, y);
- mm->add_instruction(migraphx::make_op("div"), diff, z);
+ auto diff = mm->add_instruction(migraphx::make_op("add"), x, y);
+ mm->add_instruction(migraphx::make_op("add"), diff, z);
return p;
}
......@@ -46,15 +47,17 @@ TEST_CASE(is_supported)
{
auto p = create_program();
auto targets = migraphx::get_targets();
- EXPECT(!targets.empty());
- auto first_target = targets[0];
- auto t = migraphx::make_target(first_target);
+ EXPECT(not targets.empty());
+ auto t = migraphx::make_target("fpga");
const auto assignments = p.get_target_assignments({t});
- for(const auto& [ins, target] : assignments)
+ const auto* mod = p.get_main_module();
+ EXPECT(mod->size() == assignments.size());
+ for(const auto ins : iterator_for(*mod))
{
(void)ins;
- EXPECT(target == first_target);
+ const auto& target = assignments.at(ins);
+ EXPECT(target == "fpga");
}
}
......
......@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "migraphx/dead_code_elimination.hpp"
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/fuse_pointwise.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/pass_manager.hpp>
......
......@@ -40,6 +40,10 @@
#include <migraphx/make_op.hpp>
#include <basic_ops.hpp>
#include <test.hpp>
#include "make_precompile_op.hpp"
// Treat some operators as compilable to enable lowering
MIGRAPHX_GPU_TEST_PRECOMPILE("add", "mul", "convert")
void run_lowering(migraphx::program& p, bool offload_copy = false)
{
......@@ -118,7 +122,7 @@ TEST_CASE(no_copy_dead_param)
auto xb = mm->add_instruction(migraphx::make_op("hip::allocate", {{"shape", to_value(s)}}));
auto gx = mm->add_instruction(migraphx::make_op("hip::copy_to_gpu"), x, xb);
auto ab = mm->add_instruction(migraphx::make_op("hip::allocate", {{"shape", to_value(s)}}));
- auto sum = mm->add_instruction(migraphx::make_op("gpu::add"), gx, gx, ab);
+ auto sum = mm->add_instruction(make_precompile_op("add"), gx, gx, ab);
auto r = mm->add_instruction(migraphx::make_op("hip::copy_from_gpu"), sum);
mm->add_return({r});
......
......@@ -307,12 +307,14 @@ TEST_CASE(compile_math)
"erf(x)",
"exp(x)",
"floor(x)",
"fmod(x, x)",
"isnan(x)",
"log(x)",
"max(x, x)",
"min(x, x)",
"pow(x, 0)",
"pow(x, x)",
"remainder(x,x)",
"round(x)",
"rsqrt(x)",
"sin(x)",
......
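fmod and remainder, both added to the math-compile list above, differ in how the implied quotient is rounded (toward zero versus to nearest), so covering both is worthwhile:

    #include <cassert>
    #include <cmath>

    int main()
    {
        assert(std::fmod(5.5, 2.0) == 1.5);       // 5.5 - 2 * 2.0; quotient truncated toward zero
        assert(std::remainder(5.5, 2.0) == -0.5); // 5.5 - 3 * 2.0; quotient rounded to nearest
        assert(std::fmod(-5.5, 2.0) == -1.5);     // fmod's sign follows the dividend
    }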
......@@ -21,63 +21,46 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
- #include <migraphx/gpu/device/gelu.hpp>
- #include <migraphx/gpu/device/nary.hpp>
- #include <migraphx/gpu/device/types.hpp>
- #include <cmath>
+ #ifndef MIGRAPHX_GUARD_TEST_GPU_MAKE_PRECOMPILE_OP_HPP
+ #define MIGRAPHX_GUARD_TEST_GPU_MAKE_PRECOMPILE_OP_HPP
- namespace migraphx {
- inline namespace MIGRAPHX_INLINE_NS {
- namespace gpu {
- namespace device {
+ #include <migraphx/operation.hpp>
+ #include <migraphx/gpu/compiler.hpp>
+ #include <migraphx/make_op.hpp>
- // x * 0.5 * (1.0 + erf(x / sqrt(2.0)))
- template <class T>
- auto gelu_fn(T x) __device__
- {
- return x * 0.5 * (1 + ::erf(x * M_SQRT1_2));
- }
+ // NOLINTNEXTLINE
+ #define MIGRAPHX_GPU_TEST_PRECOMPILE(...) \
+ struct test_compiler : migraphx::gpu::compiler<test_compiler> \
+ { \
+ std::vector<std::string> names() const { return {__VA_ARGS__}; } \
+ \
+ template <class... Ts> \
+ migraphx::operation compile_op(Ts&&...) const \
+ { \
+ MIGRAPHX_THROW("Not compilable"); \
+ } \
+ \
+ template <class... Ts> \
+ migraphx::gpu::compiler_replace compile(Ts&&...) const \
+ { \
+ MIGRAPHX_THROW("Not compilable"); \
+ } \
+ };
- // 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * pow(x, 3))))
- template <class T>
- auto gelu_fn_new(T x) __device__
+ inline migraphx::operation make_precompile_op(migraphx::rank<0>, const migraphx::operation& op)
{
- return 0.5 * x * (1 + tanh(sqrt(M_2_PI) * (x + 0.044715 * x * x * x)));
+ return migraphx::make_op("gpu::precompile_op", {{"op", migraphx::to_value(op)}});
}
- void gelu(hipStream_t stream, const argument& result, const argument& arg)
+ inline migraphx::operation make_precompile_op(migraphx::rank<1>, const std::string& name)
{
- nary(stream, result, arg)([](auto x) __device__ { return gelu_fn(to_hip_type(x)); });
+ return make_precompile_op(migraphx::rank<0>{}, migraphx::make_op(name));
}
- void gelu_new(hipStream_t stream, const argument& result, const argument& arg)
- {
- nary(stream, result, arg)([](auto x) __device__ { return gelu_fn_new(to_hip_type(x)); });
- }
- void add_gelu(hipStream_t stream,
- const argument& result,
- const argument& arg1,
- const argument& arg2)
- {
- nary(stream, result, arg1, arg2)([](auto x, auto y) __device__ {
- auto sum = to_hip_type(x + y);
- return gelu_fn(sum);
- });
- }
- void add_gelu_new(hipStream_t stream,
- const argument& result,
- const argument& arg1,
- const argument& arg2)
+ template <class T>
+ auto make_precompile_op(const T& x)
{
- nary(stream, result, arg1, arg2)([](auto x, auto y) __device__ {
- auto sum = to_hip_type(x + y);
- return gelu_fn(sum);
- });
+ return make_precompile_op(migraphx::rank<1>{}, x);
}
- } // namespace device
- } // namespace gpu
- } // namespace MIGRAPHX_INLINE_NS
- } // namespace migraphx
+ #endif // MIGRAPHX_GUARD_TEST_GPU_MAKE_PRECOMPILE_OP_HPP
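The new header resolves make_precompile_op through migraphx::rank<N> tags: the dispatcher calls with the highest rank, an exact-rank overload wins when viable, and lower-rank overloads act as fallbacks via the derived-to-base conversion rank<1> -> rank<0>. A generic, self-contained sketch of the same idiom (illustrative names; here the preferred overload is gated by expression SFINAE):

    #include <cassert>
    #include <string>
    #include <vector>

    // rank<N> derives from rank<N-1>, so higher ranks convert down to lower ones.
    template <int N>
    struct rank : rank<N - 1>
    {
    };
    template <>
    struct rank<0>
    {
    };

    // Preferred overload: only viable for types with a .size() member.
    template <class T>
    auto describe(rank<1>, const T& x) -> decltype(x.size(), std::string{})
    {
        return "container of size " + std::to_string(x.size());
    }

    // Fallback overload: anything else.
    template <class T>
    std::string describe(rank<0>, const T&)
    {
        return "scalar";
    }

    template <class T>
    std::string describe(const T& x)
    {
        return describe(rank<1>{}, x); // try the highest rank first
    }

    int main()
    {
        assert(describe(std::vector<int>{1, 2, 3}) == "container of size 3");
        assert(describe(7) == "scalar");
    }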
......@@ -37,10 +37,6 @@
#include <migraphx/functional.hpp>
#include <test.hpp>
- using migraphx::trim;
- // m test_gpu_mlir && ./bin/test_gpu_mlir
struct mlir_gpu_target : migraphx::gpu::target
{
std::string name() const { return "mlir"; }
......@@ -144,7 +140,7 @@ TEST_CASE(conv)
{
const std::string mlir_output = R"__migraphx__(
module {
- func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+ func.func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
%0 = migraphx.convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1], use_dynamic_same_auto_pad = 0 : i64} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
return %0 : tensor<1x2x2x2xf32>
}
......@@ -167,7 +163,7 @@ TEST_CASE(conv_add_relu)
{
const std::string mlir_output = R"__migraphx__(
module {
- func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+ func.func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
%0 = migraphx.convolution(%arg2, %arg1) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1], use_dynamic_same_auto_pad = 0 : i64} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
%1 = migraphx.add(%0, %arg0) : (tensor<1x2x2x2xf32>, tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
%2 = migraphx.relu(%1) : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
......
......@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "migraphx/instruction_ref.hpp"
#include <migraphx/instruction_ref.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/target.hpp>
......@@ -30,6 +30,7 @@
#include <migraphx/adjust_allocation.hpp>
#include <migraphx/gpu/pack_int8_args.hpp>
#include <migraphx/gpu/rocblas.hpp>
+ #include <migraphx/gpu/device_name.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/replace_allocate.hpp>
......@@ -38,10 +39,13 @@
#include <migraphx/pass_manager.hpp>
#include <migraphx/make_op.hpp>
#include <test.hpp>
#include "make_precompile_op.hpp"
void run_passes(migraphx::module& m)
// Treat some operators as compilable to enable lowering
MIGRAPHX_GPU_TEST_PRECOMPILE("add", "mul", "convert")
void run_passes(migraphx::module& m, migraphx::gpu::context& ctx)
{
auto ctx = migraphx::gpu::context{};
migraphx::run_passes(m,
{migraphx::auto_contiguous{},
migraphx::gpu::lowering{&ctx, false},
......@@ -52,18 +56,6 @@ void run_passes(migraphx::module& m)
migraphx::dead_code_elimination{}});
}
- bool get_int8_x4_format()
- {
- bool int8_x4_format = true;
- #if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
- auto ctx = migraphx::gpu::context{};
- rocblas_gemm_flags flag;
- rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
- int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
- #endif
- return int8_x4_format;
- }
TEST_CASE(quant_dot)
{
auto create_module = [] {
......@@ -102,11 +94,13 @@ TEST_CASE(quant_dot)
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m2_shape)}}));
packa = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), l2, alloc);
}
- auto gemm =
- m.add_instruction(migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}),
- l1,
- packa,
- gemm_alloc);
+ auto gemm = m.add_instruction(
+ migraphx::make_op("gpu::quant_gemm",
+ {{"int8_x4_format", int8_x4},
+ {"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
+ l1,
+ packa,
+ gemm_alloc);
auto beta_broadcast = m.add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", m3_shape.lens()}}), beta);
......@@ -116,19 +110,19 @@ TEST_CASE(quant_dot)
m.add_instruction(migraphx::make_op("gpu::contiguous"), beta_broadcast, beta_alloc);
auto mul_alloc = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(m3_shape)}}));
- auto m3_beta =
- m.add_instruction(migraphx::make_op("gpu::mul"), l3, beta_contiguous, mul_alloc);
- auto gemm_add = m.add_instruction(migraphx::make_op("gpu::add"), gemm, m3_beta, output);
+ auto m3_beta = m.add_instruction(make_precompile_op("mul"), l3, beta_contiguous, mul_alloc);
+ auto gemm_add = m.add_instruction(make_precompile_op("add"), gemm, m3_beta, output);
m.add_return({gemm_add});
return m;
};
- auto m1 = create_module();
- run_passes(m1);
+ auto m1 = create_module();
+ auto ctx = migraphx::gpu::context{};
+ run_passes(m1, ctx);
- bool flag = get_int8_x4_format();
- auto m2 = create_optimized_int8_x4(flag);
+ bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
+ auto m2 = create_optimized_int8_x4(int8_x4);
EXPECT(m1 == m2);
}
......@@ -187,21 +181,23 @@ TEST_CASE(quant_dot_trans)
// back result to int8
auto tl1_convert_alloc = m.add_instruction(migraphx::make_op(
"hip::allocate", {{"shape", migraphx::to_value(alpha_contiguous->get_shape())}}));
- auto tl1_convert = m.add_instruction(
- migraphx::make_op("gpu::convert", {{"target_type", alpha->get_shape().type()}}),
- conta,
- tl1_convert_alloc);
- auto mul_alloc = m.add_instruction(migraphx::make_op(
+ auto tl1_convert =
+ m.add_instruction(make_precompile_op(migraphx::make_op(
+ "convert", {{"target_type", alpha->get_shape().type()}})),
+ conta,
+ tl1_convert_alloc);
+ auto mul_alloc = m.add_instruction(migraphx::make_op(
"hip::allocate", {{"shape", migraphx::to_value(tl1_convert->get_shape())}}));
- auto tl1_alpha_int32 = m.add_instruction(
- migraphx::make_op("gpu::mul"), alpha_contiguous, tl1_convert, mul_alloc);
+ auto tl1_alpha_int32 =
+ m.add_instruction(make_precompile_op("mul"), alpha_contiguous, tl1_convert, mul_alloc);
// convert mul_res to int8
auto tl1_alpha_int8_alloc = m.add_instruction(migraphx::make_op(
"hip::allocate", {{"shape", migraphx::to_value(conta->get_shape())}}));
- auto tl1_alpha_int8 = m.add_instruction(
- migraphx::make_op("gpu::convert", {{"target_type", conta->get_shape().type()}}),
- tl1_alpha_int32,
- tl1_alpha_int8_alloc);
+ auto tl1_alpha_int8 =
+ m.add_instruction(make_precompile_op(migraphx::make_op(
+ "convert", {{"target_type", conta->get_shape().type()}})),
+ tl1_alpha_int32,
+ tl1_alpha_int8_alloc);
auto packb = contb;
if(int8_x4)
......@@ -211,21 +207,24 @@ TEST_CASE(quant_dot_trans)
packb = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), contb, allocpb);
}
- auto gemm =
- m.add_instruction(migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}),
- tl1_alpha_int8,
- packb,
- output);
+ auto gemm = m.add_instruction(
+ migraphx::make_op("gpu::quant_gemm",
+ {{"int8_x4_format", int8_x4},
+ {"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
+ tl1_alpha_int8,
+ packb,
+ output);
m.add_return({gemm});
return m;
};
- auto m1 = create_module();
- bool flag = get_int8_x4_format();
- auto m2 = create_optimized_int8_x4(flag);
+ auto m1 = create_module();
+ auto ctx = migraphx::gpu::context{};
+ run_passes(m1, ctx);
- run_passes(m1);
+ bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
+ auto m2 = create_optimized_int8_x4(int8_x4);
EXPECT(m1 == m2);
}
......@@ -292,11 +291,13 @@ TEST_CASE(quant_dot_pad)
packa = m.add_instruction(migraphx::make_op("gpu::int8_gemm_pack_a"), pl2, alloc);
}
- auto gemm =
- m.add_instruction(migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}),
- pl1,
- packa,
- gemm_alloc);
+ auto gemm = m.add_instruction(
+ migraphx::make_op("gpu::quant_gemm",
+ {{"int8_x4_format", int8_x4},
+ {"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
+ pl1,
+ packa,
+ gemm_alloc);
auto beta_broadcast =
m.add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s3.lens()}}), beta);
......@@ -306,18 +307,18 @@ TEST_CASE(quant_dot_pad)
m.add_instruction(migraphx::make_op("gpu::contiguous"), beta_broadcast, beta_alloc);
auto mul_alloc = m.add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(s3)}}));
- auto m3_beta =
- m.add_instruction(migraphx::make_op("gpu::mul"), l3, beta_contiguous, mul_alloc);
- auto gemm_add = m.add_instruction(migraphx::make_op("gpu::add"), gemm, m3_beta, output);
+ auto m3_beta = m.add_instruction(make_precompile_op("mul"), l3, beta_contiguous, mul_alloc);
+ auto gemm_add = m.add_instruction(make_precompile_op("add"), gemm, m3_beta, output);
m.add_return({gemm_add});
return m;
};
- auto m1 = create_module();
- bool flag = get_int8_x4_format();
- auto m2 = create_optimized_int8_x4(flag);
+ auto m1 = create_module();
+ auto ctx = migraphx::gpu::context{};
+ run_passes(m1, ctx);
- run_passes(m1);
+ bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
+ auto m2 = create_optimized_int8_x4(int8_x4);
EXPECT(m1 == m2);
}
......@@ -396,14 +397,15 @@ TEST_CASE(quant_dot_trans_pad)
// back result to int8
auto tl1_convert_alloc = m.add_instruction(migraphx::make_op(
"hip::allocate", {{"shape", migraphx::to_value(alpha_contiguous->get_shape())}}));
- auto tl1_convert = m.add_instruction(
- migraphx::make_op("gpu::convert", {{"target_type", alpha->get_shape().type()}}),
- conta,
- tl1_convert_alloc);
- auto mul_alloc = m.add_instruction(migraphx::make_op(
+ auto tl1_convert =
+ m.add_instruction(make_precompile_op(migraphx::make_op(
+ "convert", {{"target_type", alpha->get_shape().type()}})),
+ conta,
+ tl1_convert_alloc);
+ auto mul_alloc = m.add_instruction(migraphx::make_op(
"hip::allocate", {{"shape", migraphx::to_value(tl1_convert->get_shape())}}));
- auto tl1_alpha_int32 = m.add_instruction(
- migraphx::make_op("gpu::mul"), alpha_contiguous, tl1_convert, mul_alloc);
+ auto tl1_alpha_int32 =
+ m.add_instruction(make_precompile_op("mul"), alpha_contiguous, tl1_convert, mul_alloc);
// convert mul_res to int8
auto tl1_alpha_int8_alloc = m.add_instruction(migraphx::make_op(
"hip::allocate", {{"shape", migraphx::to_value(conta->get_shape())}}));
......@@ -415,10 +417,11 @@ TEST_CASE(quant_dot_trans_pad)
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ps1)}}));
}
- auto tl1_alpha_int8 = m.add_instruction(
- migraphx::make_op("gpu::convert", {{"target_type", conta->get_shape().type()}}),
- tl1_alpha_int32,
- tl1_alpha_int8_alloc);
+ auto tl1_alpha_int8 =
+ m.add_instruction(make_precompile_op(migraphx::make_op(
+ "convert", {{"target_type", conta->get_shape().type()}})),
+ tl1_alpha_int32,
+ tl1_alpha_int8_alloc);
auto pa = tl1_alpha_int8;
if(int8_x4)
......@@ -438,17 +441,23 @@ TEST_CASE(quant_dot_trans_pad)
}
auto gemm = m.add_instruction(
migraphx::make_op("gpu::quant_gemm", {{"int8_x4_format", int8_x4}}), pa, packb, output);
migraphx::make_op("gpu::quant_gemm",
{{"int8_x4_format", int8_x4},
{"compute_fp32", migraphx::gpu::get_compute_fp32_flag()}}),
pa,
packb,
output);
m.add_return({gemm});
return m;
};
- auto m1 = create_module();
- bool flag = get_int8_x4_format();
- auto m2 = create_optimized_int8_x4(flag);
+ auto m1 = create_module();
+ auto ctx = migraphx::gpu::context{};
+ run_passes(m1, ctx);
- run_passes(m1);
+ bool int8_x4 = migraphx::gpu::get_int8_x4_format(ctx);
+ auto m2 = create_optimized_int8_x4(int8_x4);
EXPECT(m1 == m2);
}
......
......@@ -112,12 +112,12 @@ struct mod_pass_op
migraphx::shape compute_shape(std::vector<migraphx::shape> inputs,
std::vector<migraphx::module_ref> mods) const
{
- if(!mods.empty())
+ if(not mods.empty())
{
auto out_shapes = mods[0]->get_output_shapes();
return out_shapes[0];
}
- if(!inputs.empty())
+ if(not inputs.empty())
{
return inputs.front();
}
......@@ -186,9 +186,10 @@ struct nop
migraphx::shape compute_shape(const std::vector<migraphx::shape>&) const { return {}; }
};
- inline migraphx::literal get_2x2()
+ inline migraphx::literal get_2x2(int base = 0)
{
- return migraphx::literal{{migraphx::shape::float_type, {2, 2}}, {1, 2, 3, 4}};
+ return migraphx::literal{{migraphx::shape::float_type, {2, 2}},
+ {base + 1, base + 2, base + 3, base + 4}};
}
inline migraphx::literal get_2x2_transposed()
......
......@@ -108,15 +108,7 @@ struct function
};
template <class Stream, class Iterator>
- inline Stream& stream_range(Stream& s, Iterator start, Iterator last)
- {
- if(start != last)
- {
- s << *start;
- std::for_each(std::next(start), last, [&](auto&& x) { s << ", " << x; });
- }
- return s;
- }
+ Stream& stream_range(Stream& s, Iterator start, Iterator last);
template <class Stream>
inline Stream& operator<<(Stream& s, std::nullptr_t)
......@@ -136,6 +128,17 @@ inline auto operator<<(Stream& s, const Range& v) -> decltype(stream_range(s, v.
return s;
}
+ template <class Stream, class Iterator>
+ inline Stream& stream_range(Stream& s, Iterator start, Iterator last)
+ {
+ if(start != last)
+ {
+ s << *start;
+ std::for_each(std::next(start), last, [&](auto&& x) { s << ", " << x; });
+ }
+ return s;
+ }
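Leaving only a declaration of stream_range before the range operator<< and moving the definition after it makes that operator visible to unqualified lookup inside the template body, which matters when the elements of a range are themselves ranges: ADL alone would not find it for std:: containers. A self-contained sketch of the arrangement (a plausible motivation, not stated in the commit):

    #include <iostream>
    #include <vector>

    // Declaration first, so the range operator<< below can name it.
    template <class Iterator>
    std::ostream& stream_range(std::ostream& s, Iterator start, Iterator last);

    // Range printer, also usable for the elements of an outer range.
    template <class Range>
    auto operator<<(std::ostream& s, const Range& v)
        -> decltype(stream_range(s, v.begin(), v.end()))
    {
        return stream_range(s, v.begin(), v.end());
    }

    // Defined last, so `s << *it` can find the operator above when the
    // elements are themselves ranges (e.g. vector<vector<int>>).
    template <class Iterator>
    std::ostream& stream_range(std::ostream& s, Iterator start, Iterator last)
    {
        for(auto it = start; it != last; ++it)
            s << (it == start ? "" : ", ") << *it;
        return s;
    }

    int main()
    {
        std::vector<std::vector<int>> v{{1, 2}, {3}};
        std::cout << v << "\n"; // prints: 1, 2, 3
    }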
template <class T>
const T& get_value(const T& x)
{
......@@ -342,7 +345,7 @@ inline std::ostream& operator<<(std::ostream& os, const color& c)
template <class T, class F>
void failed(T x, const char* msg, const char* func, const char* file, int line, F f)
{
- if(!bool(x.value()))
+ if(not bool(x.value()))
{
std::cout << func << std::endl;
std::cout << file << ":" << line << ":" << std::endl;
......