Commit eb4dfb49 authored by charlie

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_nms

parents 20394c35 2783c649
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
cmake_minimum_required(VERSION 3.5)
project (custom_hip_kernel)
set (CMAKE_CXX_STANDARD 14)
set (EXAMPLE custom_op_hip_kernel)
list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
find_package (migraphx REQUIRED)
find_package (hip REQUIRED)
message("source file: " ${EXAMPLE}.cpp " ---> bin: " ${EXAMPLE})
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
target_link_libraries(${EXAMPLE} migraphx::c hip::device)
# Custom HIP Kernel using MIGraphX API
This is an example of a custom operator implementation using MIGraphX's C/C++ APIs. It also demonstrates how to use this custom op in conjunction with the rest of the MIGraphX operators to build and run a MIGraphX program on the GPU.
Kernels can be written in HIP, with MIOpen, or with the rocBLAS library. This particular example uses **HIP**. A minimal sketch of the custom-op interface appears after the build steps below.
To build the example, ensure ROCm is installed at `/opt/rocm`.
1. `export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH`
2. `cd $MIGRAPHX_SRC/examples/migraphx/custom_op_hip_kernel/`
3. `mkdir build && cd build`
4. `CXX=/opt/rocm/llvm/bin/clang++ cmake .. && make`
5. `./custom_op_hip_kernel`
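
As a quick orientation before the full listing, here is a minimal sketch of the custom-op interface this example implements (the struct and op names are illustrative, not part of the API): every custom op derives from `migraphx::experimental_custom_op_base` and overrides `name`, `compute`, and `compute_shape`.

```cpp
#include <migraphx/migraphx.hpp> // MIGraphX's C++ API

// Minimal sketch of a custom op; the complete, runnable version is
// custom_op_hip_kernel.cpp below.
struct my_custom_op final : migraphx::experimental_custom_op_base
{
    std::string name() const override { return "my_custom_op"; }
    // Run the computation. By the convention used in these examples, the last
    // input argument is the pre-allocated output buffer, so it is returned.
    migraphx::argument
    compute(migraphx::context, migraphx::shape, migraphx::arguments inputs) const override
    {
        return inputs[inputs.size() - 1];
    }
    // Validate the input shapes and report the output shape.
    migraphx::shape compute_shape(migraphx::shapes inputs) const override
    {
        return inputs.back();
    }
};
```

The op is registered once with `migraphx::register_experimental_custom_op` and then referenced by name via `migraphx::operation`, as `main` in the full example shows.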
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <algorithm>
#include <cmath>
#include <hip/hip_runtime.h>
#include <migraphx/migraphx.hpp> // MIGraphX's C++ API
#include <numeric>
#define MIGRAPHX_HIP_ASSERT(x) (assert((x) == hipSuccess))
/*
* Square each element in the array A and write to array C.
*/
template <typename T>
__global__ void vector_square(T* C_d, const T* A_d, size_t N)
{
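    // Grid-stride loop: each thread starts at its global index and advances by
    // the total number of threads in the grid, so any N is covered regardless
    // of the launch configuration.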
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x;
for(size_t i = offset; i < N; i += stride)
{
C_d[i] = A_d[i] * A_d[i];
}
}
struct square_custom_op final : migraphx::experimental_custom_op_base
{
virtual std::string name() const override { return "square_custom_op"; }
virtual migraphx::argument
compute(migraphx::context ctx, migraphx::shape, migraphx::arguments inputs) const override
{
        // If the compile options have offload_copy = true, then parameters and outputs are
        // automatically copied to and from the GPU's memory. Here we assume the `inputs`
        // arguments are already on the GPU, so there is no need to call hipMalloc, hipFree,
        // or hipMemcpy. The last element in `inputs` is the output argument, so it is
        // returned from the compute method.
auto* input_buffer = reinterpret_cast<float*>(inputs[0].data());
auto* output_buffer = reinterpret_cast<float*>(inputs[1].data());
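        // Element count = total bytes / element size. Note that sizeof(...type()) is the
        // size of the datatype enum value, which happens to match sizeof(float) for this
        // float-only example.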
size_t n_elements = inputs[0].get_shape().bytes() / sizeof(inputs[0].get_shape().type());
MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
const unsigned blocks = 512;
const unsigned threads_per_block = 256;
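        // Launch the kernel on the HIP stream MIGraphX is executing on
        // (ctx.get_queue<hipStream_t>() below), so it is ordered correctly with
        // the rest of the compiled program.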
// cppcheck-suppress UseDeviceLaunch
hipLaunchKernelGGL(vector_square,
dim3(blocks),
dim3(threads_per_block),
0,
ctx.get_queue<hipStream_t>(),
output_buffer,
input_buffer,
n_elements);
return inputs[1];
}
virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
{
if(inputs.size() != 2)
{
throw std::runtime_error("square_custom_op must have 2 arguments");
}
if(inputs[0] != inputs[1])
{
throw std::runtime_error("Inputs to the square_custom_op must have same Shape");
}
return inputs.back();
}
};
int main(int argc, const char* argv[])
{
square_custom_op square_op;
migraphx::register_experimental_custom_op(square_op);
migraphx::program p;
migraphx::shape s{migraphx_shape_float_type, {32, 256}};
migraphx::module m = p.get_main_module();
auto x = m.add_parameter("x", s);
auto neg_ins = m.add_instruction(migraphx::operation("neg"), x);
// add allocation for the custom_kernel's output buffer
auto alloc = m.add_allocation(s);
auto custom_kernel =
m.add_instruction(migraphx::operation("square_custom_op"), {neg_ins, alloc});
auto relu_ins = m.add_instruction(migraphx::operation("relu"), {custom_kernel});
m.add_return({relu_ins});
migraphx::compile_options options;
// set offload copy to true for GPUs
options.set_offload_copy();
p.compile(migraphx::target("gpu"), options);
migraphx::program_parameters pp;
std::vector<float> x_data(s.bytes() / sizeof(s.type()));
std::iota(x_data.begin(), x_data.end(), 0);
pp.add("x", migraphx::argument(s, x_data.data()));
auto results = p.eval(pp);
auto result = results[0];
std::vector<float> expected_result = x_data;
std::transform(expected_result.begin(),
expected_result.end(),
expected_result.begin(),
[](auto i) { return std::pow(i, 2); });
if(bool{result == migraphx::argument(s, expected_result.data())})
{
std::cout << "Successfully executed custom HIP kernel example\n";
}
else
{
std::cout << "Custom HIP kernel example failed\n";
}
return 0;
}
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
cmake_minimum_required(VERSION 3.5)
project (custom_miopen_kernel)
set (CMAKE_CXX_STANDARD 14)
set (EXAMPLE custom_op_miopen_kernel)
list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
find_package (migraphx REQUIRED)
find_package (miopen REQUIRED)
message("source file: " ${EXAMPLE}.cpp " ---> bin: " ${EXAMPLE})
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
target_link_libraries(${EXAMPLE} migraphx::c MIOpen)
# Custom MIOpen Kernel using MIGraphX API
This is an example of a custom operator implementation using MIGraphX's C/C++ APIs. It also demonstrates how to use this custom op in conjunction with the rest of the MIGraphX operators to build and run a MIGraphX program on the GPU.
Kernels can be written in HIP, with MIOpen, or with the rocBLAS library. This particular example uses **MIOpen** library calls.
To build and run the example, ensure ROCm is installed at `/opt/rocm`.
1. `export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH`
2. `cd $MIGRAPHX_SRC/examples/migraphx/custom_op_miopen_kernel/`
3. `mkdir build && cd build`
4. `cmake .. && make`
5. `./custom_op_miopen_kernel`
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <algorithm>
#include <cmath>
#include <hip/hip_runtime.h>
#include <migraphx/migraphx.h>
#include <miopen/miopen.h>
#include <migraphx/migraphx.hpp> // MIGraphX's C++ API
#include <numeric>
#include <stdexcept>
#define MIGRAPHX_MIOPEN_ASSERT(x) (assert((x) == miopenStatusSuccess))
#define MIGRAPHX_HIP_ASSERT(x) (assert((x) == hipSuccess))
inline miopenTensorDescriptor_t make_miopen_tensor(const migraphx::shape& s, bool pack = false)
{
miopenTensorDescriptor_t t;
MIGRAPHX_MIOPEN_ASSERT(miopenCreateTensorDescriptor(&t));
    // MIOpen tensor descriptors take int arrays, so convert the size_t lens and strides
auto s_lens = s.lengths();
std::vector<int> lens(s_lens.begin(), s_lens.end());
auto s_strides = s.strides();
std::vector<int> strides(s_strides.begin(), s_strides.end());
miopenDataType_t d;
if(s.type() == migraphx_shape_float_type)
d = miopenFloat;
else if(s.type() == migraphx_shape_half_type)
d = miopenHalf;
else if(s.type() == migraphx_shape_int32_type)
d = miopenInt32;
else if(s.type() == migraphx_shape_int8_type)
{
if(pack)
{
// update the lens and corresponding strides
d = miopenInt8x4;
lens[1] = ((lens[1] + 3) / 4) * 4;
strides[0] = strides[1] * lens[1];
}
else
{
d = miopenInt8;
}
}
else
{
throw("MAKE_TENSOR: unsupported type");
}
    MIGRAPHX_MIOPEN_ASSERT(miopenSetTensorDescriptor(t, d, s_lens.size(), lens.data(), strides.data()));
return t;
}
inline auto make_miopen_handle(migraphx::context& ctx)
{
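    // Bind MIOpen to the HIP stream MIGraphX is executing on, so the custom
    // MIOpen kernel is ordered correctly with the rest of the compiled program.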
MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
auto* stream = ctx.get_queue<hipStream_t>();
miopenHandle_t out;
MIGRAPHX_MIOPEN_ASSERT(miopenCreateWithStream(&out, stream));
return out;
}
inline auto make_activation_descriptor(miopenActivationMode_t mode,
double alpha = 0,
double beta = 0,
double gamma = 0)
{
miopenActivationDescriptor_t ad;
MIGRAPHX_MIOPEN_ASSERT(miopenCreateActivationDescriptor(&ad));
    MIGRAPHX_MIOPEN_ASSERT(miopenSetActivationDescriptor(ad, mode, alpha, beta, gamma));
return ad;
}
struct abs_custom_op final : migraphx::experimental_custom_op_base
{
virtual std::string name() const override { return "abs_custom_op"; }
virtual migraphx::argument compute(migraphx::context ctx,
migraphx::shape output_shape,
migraphx::arguments args) const override
{
float alpha = 1;
float beta = 0;
        // MIOpen kernel calls take raw buffer pointers for the tensor data. Each buffer
        // pointer must be accompanied by a tensor descriptor giving its shape, type,
        // strides, and dimensionality. The `make_miopen_tensor` helper above builds such
        // descriptors to pass as parameters to the MIOpen kernel call.
auto y_desc = make_miopen_tensor(output_shape);
auto x_desc = make_miopen_tensor(args[0].get_shape());
// create MIOpen stream handle
auto miopen_handle = make_miopen_handle(ctx);
        // MIOpen has a generic kernel for many different kinds of activation functions.
        // Each such generic call must therefore be accompanied by a descriptor saying
        // which activation computation to perform.
auto ad = make_activation_descriptor(miopenActivationABS, 0, 0, 0);
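        // miopenActivationForward computes y = alpha * op(x) + beta * y; with
        // alpha = 1 and beta = 0 here, the output is simply abs(x).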
        MIGRAPHX_MIOPEN_ASSERT(miopenActivationForward(
            miopen_handle, ad, &alpha, x_desc, args[0].data(), &beta, y_desc, args[1].data()));
return args[1];
}
virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
{
if(inputs.size() != 2)
{
throw std::runtime_error("abs_custom_op must have two input arguments");
}
if(inputs[0] != inputs[1])
{
throw std::runtime_error("Input arguments to abs_custom_op must have same shape");
}
return inputs.back();
}
};
int main(int argc, const char* argv[])
{
abs_custom_op abs_op;
migraphx::register_experimental_custom_op(abs_op);
migraphx::program p;
migraphx::shape s{migraphx_shape_float_type, {32, 256}};
migraphx::module m = p.get_main_module();
auto x = m.add_parameter("x", s);
auto neg_ins = m.add_instruction(migraphx::operation("neg"), {x});
// add allocation for the custom_kernel's output buffer
auto alloc = m.add_allocation(s);
auto custom_kernel = m.add_instruction(migraphx::operation("abs_custom_op"), {neg_ins, alloc});
auto relu_ins = m.add_instruction(migraphx::operation("relu"), {custom_kernel});
m.add_return({relu_ins});
migraphx::compile_options options;
// set offload copy to true for GPUs
options.set_offload_copy();
p.compile(migraphx::target("gpu"), options);
migraphx::program_parameters prog_params;
std::vector<float> x_data(s.bytes() / sizeof(s.type()));
std::iota(x_data.begin(), x_data.end(), 0);
prog_params.add("x", migraphx::argument(s, x_data.data()));
auto results = p.eval(prog_params);
auto result = results[0];
std::vector<float> expected_result = x_data;
std::transform(expected_result.begin(),
expected_result.end(),
expected_result.begin(),
[](auto i) { return std::abs(i); });
if(bool{result == migraphx::argument(s, expected_result.data())})
{
std::cout << "Successfully executed custom MIOpen kernel example with MIGraphX\n";
}
else
{
std::cout << "Custom MIOpen kernel example failed\n";
}
return 0;
}
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
cmake_minimum_required(VERSION 3.5)
project (custom_rocblas_kernel)
set (CMAKE_CXX_STANDARD 14)
set (EXAMPLE custom_op_rocblas_kernel)
list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
find_package (migraphx REQUIRED)
find_package (rocblas REQUIRED)
message("source file: " ${EXAMPLE}.cpp " ---> bin: " ${EXAMPLE})
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
target_link_libraries(${EXAMPLE} migraphx::c roc::rocblas)
# Custom rocBLAS Kernel using MIGraphX API
This is an example of a custom operator implementation using MIGraphX's C/C++ APIs. It also demonstrates how to use this custom op in conjunction with the rest of the MIGraphX operators to build and run a MIGraphX program on the GPU.
Kernels can be written in HIP, with MIOpen, or with the rocBLAS library. This particular example uses **rocBLAS** library calls.
To build and run the example, ensure ROCm is installed at `/opt/rocm`.
1. `export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH`
2. `cd $MIGRAPHX_SRC/examples/migraphx/custom_op_rocblas_kernel/`
3. `mkdir build && cd build`
4. `cmake .. && make`
5. `./custom_op_rocblas_kernel`
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <algorithm>
#include <hip/hip_runtime.h>
#include <rocblas.h>
#include <migraphx/migraphx.h>
#include <migraphx/migraphx.hpp> // MIGraphX's C++ API
#include <numeric>
#include <stdexcept>
#define MIGRAPHX_ROCBLAS_ASSERT(x) (assert((x) == rocblas_status::rocblas_status_success))
#define MIGRAPHX_HIP_ASSERT(x) (assert((x) == hipSuccess))
rocblas_handle create_rocblas_handle_ptr()
{
rocblas_handle handle;
MIGRAPHX_ROCBLAS_ASSERT(rocblas_create_handle(&handle));
return rocblas_handle{handle};
}
rocblas_handle create_rocblas_handle_ptr(migraphx::context& ctx)
{
MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
rocblas_handle rb = create_rocblas_handle_ptr();
auto* stream = ctx.get_queue<hipStream_t>();
MIGRAPHX_ROCBLAS_ASSERT(rocblas_set_stream(rb, stream));
return rb;
}
struct sscal_custom_op final : migraphx::experimental_custom_op_base
{
virtual std::string name() const override { return "sscal_custom_op"; }
virtual migraphx::argument compute(migraphx::context ctx,
migraphx::shape output_shape,
migraphx::arguments args) const override
{
// create rocblas stream handle
auto rocblas_handle = create_rocblas_handle_ptr(ctx);
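        // Note: for brevity this example never calls rocblas_destroy_handle; a
        // longer-lived program should destroy (or cache and reuse) the handle
        // instead of creating one per compute call.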
rocblas_int n = args[1].get_shape().lengths()[0];
float* alpha = reinterpret_cast<float*>(args[0].data());
float* vec_ptr = reinterpret_cast<float*>(args[1].data());
MIGRAPHX_ROCBLAS_ASSERT(rocblas_sscal(rocblas_handle, n, alpha, vec_ptr, 1));
return args[1];
}
virtual migraphx::shape compute_shape(migraphx::shapes inputs) const override
{
if(inputs.size() != 2)
{
throw std::runtime_error("sscal_custom_op must have 2 input arguments");
}
if(inputs[0].lengths().size() != 1 || inputs[0].lengths()[0] != 1)
{
throw std::runtime_error("first input argument to sscal_custom_op must be a scalar");
}
if(inputs[1].lengths().size() != 1)
{
throw std::runtime_error(
"second input argument to sscal_custom_op must be a vector with dimension one");
}
return inputs.back();
}
};
int main(int argc, const char* argv[])
{
// computes ReLU(neg(x) * scale)
sscal_custom_op sscal_op;
migraphx::register_experimental_custom_op(sscal_op);
migraphx::program p;
migraphx::shape x_shape{migraphx_shape_float_type, {8192}};
migraphx::shape scale_shape{migraphx_shape_float_type, {1}};
migraphx::module m = p.get_main_module();
auto x = m.add_parameter("x", x_shape);
auto scale = m.add_parameter("scale", scale_shape);
auto neg_ins = m.add_instruction(migraphx::operation("neg"), {x});
auto custom_kernel =
m.add_instruction(migraphx::operation("sscal_custom_op"), {scale, neg_ins});
auto relu_ins = m.add_instruction(migraphx::operation("relu"), {custom_kernel});
m.add_return({relu_ins});
migraphx::compile_options options;
// set offload copy to true for GPUs
options.set_offload_copy();
p.compile(migraphx::target("gpu"), options);
migraphx::program_parameters pp;
std::vector<float> x_data(x_shape.bytes() / sizeof(x_shape.type()));
std::vector<float> scale_data{-1};
std::iota(x_data.begin(), x_data.end(), 0);
pp.add("x", migraphx::argument(x_shape, x_data.data()));
pp.add("scale", migraphx::argument(scale_shape, scale_data.data()));
auto results = p.eval(pp);
auto result = results[0];
std::vector<float> expected_result = x_data;
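    // With scale = -1 the program computes relu((-x) * -1) = relu(x); the iota
    // input is non-negative, so the expected output equals x_data unchanged.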
if(bool{result == migraphx::argument(x_shape, expected_result.data())})
{
std::cout << "Successfully executed custom rocBLAS kernel example\n";
}
else
{
std::cout << "Custom rocBLAS kernel example failed\n";
}
return 0;
}
@@ -80,7 +80,7 @@
 "outputs": [],
 "source": [
 "if not os.path.exists(\"yolov4_fp16.mxr\"):\n",
-" !/opt/rocm/bin/migraphx-driver compile ./utilities/yolov4.onnx --gpu --enable-offload-copy --fp16ref --binary -o yolov4_fp16.mxr\n",
+" !/opt/rocm/bin/migraphx-driver compile ./utilities/yolov4.onnx --gpu --enable-offload-copy --fp16 --binary -o yolov4_fp16.mxr\n",
 "if not os.path.exists(\"yolov4.mxr\"):\n",
 " !/opt/rocm/bin/migraphx-driver compile ./utilities/yolov4.onnx --gpu --enable-offload-copy --binary -o yolov4.mxr"
 ]
 /*
  * The MIT License (MIT)
  *
@@ -21,10 +22,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#include <migraphx/operators.hpp>
+#include <migraphx/make_op.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/generate.hpp>
-#include <migraphx/apply_alpha_beta.hpp>
+#include <migraphx/json.hpp>
 #include "models.hpp"
 namespace migraphx {
@@ -34,173 +35,189 @@ inline namespace MIGRAPHX_INLINE_NS {
 migraphx::program alexnet(unsigned batch) // NOLINT(readability-function-size)
 {
     migraphx::program p;
-    auto* mm = p.get_main_module();
-    auto m0 =
-        mm->add_parameter("0", migraphx::shape{migraphx::shape::float_type, {batch, 3, 224, 224}});
-    auto mx0 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {1000}}, 0));
-    auto mx1 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {1000, 4096}}, 1));
-    auto mx2 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {4096}}, 2));
-    auto mx3 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {4096, 4096}}, 3));
-    auto mx4 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {4096}}, 4));
-    auto mx5 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {4096, 9216}}, 5));
-    auto mx6 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {256}}, 6));
-    auto mx7 = mm->add_literal(migraphx::generate_literal(
-        migraphx::shape{migraphx::shape::float_type, {256, 256, 3, 3}}, 7));
-    auto mx8 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {256}}, 8));
-    auto mx9 = mm->add_literal(migraphx::generate_literal(
-        migraphx::shape{migraphx::shape::float_type, {256, 384, 3, 3}}, 9));
-    auto mx10 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {384}}, 10));
-    auto mx11 = mm->add_literal(migraphx::generate_literal(
-        migraphx::shape{migraphx::shape::float_type, {384, 192, 3, 3}}, 11));
-    auto mx12 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {192}}, 12));
-    auto mx13 = mm->add_literal(migraphx::generate_literal(
-        migraphx::shape{migraphx::shape::float_type, {192, 64, 5, 5}}, 13));
-    auto mx14 = mm->add_literal(
-        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {64}}, 14));
-    auto mx15 = mm->add_literal(migraphx::generate_literal(
-        migraphx::shape{migraphx::shape::float_type, {64, 3, 11, 11}}, 15));
-    migraphx::op::convolution convolution16;
-    convolution16.padding = {2, 2};
-    convolution16.stride = {4, 4};
-    convolution16.dilation = {1, 1};
-    convolution16.group = 1;
-    auto mx16 = mm->add_instruction(convolution16, m0, mx15);
-    migraphx::op::broadcast broadcast17;
-    broadcast17.axis = 1;
-    broadcast17.broadcast_lens = {batch, 64, 55, 55};
-    auto mx17 = mm->add_instruction(broadcast17, mx14);
-    migraphx::op::add add18;
-    auto mx18 = mm->add_instruction(add18, mx16, mx17);
-    migraphx::op::relu relu19;
-    auto mx19 = mm->add_instruction(relu19, mx18);
-    migraphx::op::pooling pooling20;
-    pooling20.mode = migraphx::op::pooling_mode::max;
-    pooling20.padding = {0, 0};
-    pooling20.stride = {2, 2};
-    pooling20.lengths = {3, 3};
-    auto mx20 = mm->add_instruction(pooling20, mx19);
-    migraphx::op::convolution convolution21;
-    convolution21.padding = {2, 2};
-    convolution21.stride = {1, 1};
-    convolution21.dilation = {1, 1};
-    convolution21.group = 1;
-    auto mx21 = mm->add_instruction(convolution21, mx20, mx13);
-    migraphx::op::broadcast broadcast22;
-    broadcast22.axis = 1;
-    broadcast22.broadcast_lens = {batch, 192, 27, 27};
-    auto mx22 = mm->add_instruction(broadcast22, mx12);
-    migraphx::op::add add23;
-    auto mx23 = mm->add_instruction(add23, mx21, mx22);
-    migraphx::op::relu relu24;
-    auto mx24 = mm->add_instruction(relu24, mx23);
-    migraphx::op::pooling pooling25;
-    pooling25.mode = migraphx::op::pooling_mode::max;
-    pooling25.padding = {0, 0};
-    pooling25.stride = {2, 2};
-    pooling25.lengths = {3, 3};
-    auto mx25 = mm->add_instruction(pooling25, mx24);
-    migraphx::op::convolution convolution26;
-    convolution26.padding = {1, 1};
-    convolution26.stride = {1, 1};
-    convolution26.dilation = {1, 1};
-    convolution26.group = 1;
-    auto mx26 = mm->add_instruction(convolution26, mx25, mx11);
-    migraphx::op::broadcast broadcast27;
-    broadcast27.axis = 1;
-    broadcast27.broadcast_lens = {batch, 384, 13, 13};
-    auto mx27 = mm->add_instruction(broadcast27, mx10);
-    migraphx::op::add add28;
-    auto mx28 = mm->add_instruction(add28, mx26, mx27);
-    migraphx::op::relu relu29;
-    auto mx29 = mm->add_instruction(relu29, mx28);
-    migraphx::op::convolution convolution30;
-    convolution30.padding = {1, 1};
-    convolution30.stride = {1, 1};
-    convolution30.dilation = {1, 1};
-    convolution30.group = 1;
-    auto mx30 = mm->add_instruction(convolution30, mx29, mx9);
-    migraphx::op::broadcast broadcast31;
-    broadcast31.axis = 1;
-    broadcast31.broadcast_lens = {batch, 256, 13, 13};
-    auto mx31 = mm->add_instruction(broadcast31, mx8);
-    migraphx::op::add add32;
-    auto mx32 = mm->add_instruction(add32, mx30, mx31);
-    migraphx::op::relu relu33;
-    auto mx33 = mm->add_instruction(relu33, mx32);
-    migraphx::op::convolution convolution34;
-    convolution34.padding = {1, 1};
-    convolution34.stride = {1, 1};
-    convolution34.dilation = {1, 1};
-    convolution34.group = 1;
-    auto mx34 = mm->add_instruction(convolution34, mx33, mx7);
-    migraphx::op::broadcast broadcast35;
-    broadcast35.axis = 1;
-    broadcast35.broadcast_lens = {batch, 256, 13, 13};
-    auto mx35 = mm->add_instruction(broadcast35, mx6);
-    migraphx::op::add add36;
-    auto mx36 = mm->add_instruction(add36, mx34, mx35);
-    migraphx::op::relu relu37;
-    auto mx37 = mm->add_instruction(relu37, mx36);
-    migraphx::op::pooling pooling38;
-    pooling38.mode = migraphx::op::pooling_mode::max;
-    pooling38.padding = {0, 0};
-    pooling38.stride = {2, 2};
-    pooling38.lengths = {3, 3};
-    auto mx38 = mm->add_instruction(pooling38, mx37);
-    migraphx::op::flatten flatten39;
-    flatten39.axis = 1;
-    auto mx39 = mm->add_instruction(flatten39, mx38);
-    migraphx::op::identity identity40;
-    auto mx40 = mm->add_instruction(identity40, mx39);
-    migraphx::op::transpose transpose41;
-    transpose41.dims = {1, 0};
-    auto mx41 = mm->add_instruction(transpose41, mx5);
-    migraphx::op::multibroadcast multibroadcast42;
-    multibroadcast42.output_lens = {batch, 4096};
-    auto mx42 = mm->add_instruction(multibroadcast42, mx4);
-    float dot43_alpha = 1;
-    float dot43_beta = 1;
-    auto mx43 = migraphx::add_apply_alpha_beta(
-        *mm, {mx40, mx41, mx42}, migraphx::make_op("dot"), dot43_alpha, dot43_beta);
-    migraphx::op::relu relu44;
-    auto mx44 = mm->add_instruction(relu44, mx43);
-    migraphx::op::identity identity45;
-    auto mx45 = mm->add_instruction(identity45, mx44);
-    migraphx::op::transpose transpose46;
-    transpose46.dims = {1, 0};
-    auto mx46 = mm->add_instruction(transpose46, mx3);
-    migraphx::op::multibroadcast multibroadcast47;
-    multibroadcast47.output_lens = {batch, 4096};
-    auto mx47 = mm->add_instruction(multibroadcast47, mx2);
-    float dot48_alpha = 1;
-    float dot48_beta = 1;
-    auto mx48 = migraphx::add_apply_alpha_beta(
-        *mm, {mx45, mx46, mx47}, migraphx::make_op("dot"), dot48_alpha, dot48_beta);
-    migraphx::op::relu relu49;
-    auto mx49 = mm->add_instruction(relu49, mx48);
-    migraphx::op::transpose transpose50;
-    transpose50.dims = {1, 0};
-    auto mx50 = mm->add_instruction(transpose50, mx1);
-    migraphx::op::multibroadcast multibroadcast51;
-    multibroadcast51.output_lens = {batch, 1000};
-    auto mx51 = mm->add_instruction(multibroadcast51, mx0);
-    float dot52_alpha = 1;
-    float dot52_beta = 1;
-    migraphx::add_apply_alpha_beta(
-        *mm, {mx49, mx50, mx51}, migraphx::make_op("dot"), dot52_alpha, dot52_beta);
+    migraphx::module_ref mmain = p.get_main_module();
+    auto x_main_module_0 = mmain->add_literal(migraphx::abs(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {1}}, 0)));
+    auto x_main_module_1 = mmain->add_literal(migraphx::abs(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {1}}, 1)));
+    auto x_main_module_2 = mmain->add_literal(migraphx::abs(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {1}}, 2)));
+    auto x_input_1 = mmain->add_parameter(
+        "input.1", migraphx::shape{migraphx::shape::float_type, {batch, 3, 224, 224}});
+    auto x_main_module_4 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {4096, 4096}}, 3));
+    auto x_main_module_5 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {4096}}, 4));
+    auto x_main_module_6 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {4096, 9216}}, 5));
+    auto x_main_module_7 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {4096}}, 6));
+    auto x_main_module_8 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {1000, 4096}}, 7));
+    auto x_main_module_9 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {1000}}, 8));
+    auto x_main_module_10 = mmain->add_literal(migraphx::generate_literal(
+        migraphx::shape{migraphx::shape::float_type, {256, 384, 3, 3}}, 9));
+    auto x_main_module_11 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {256}}, 10));
+    auto x_main_module_12 = mmain->add_literal(migraphx::generate_literal(
+        migraphx::shape{migraphx::shape::float_type, {384, 192, 3, 3}}, 11));
+    auto x_main_module_13 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {384}}, 12));
+    auto x_main_module_14 = mmain->add_literal(migraphx::generate_literal(
+        migraphx::shape{migraphx::shape::float_type, {192, 64, 5, 5}}, 13));
+    auto x_main_module_15 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {192}}, 14));
+    auto x_main_module_16 = mmain->add_literal(migraphx::generate_literal(
+        migraphx::shape{migraphx::shape::float_type, {256, 256, 3, 3}}, 15));
+    auto x_main_module_17 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {256}}, 16));
+    auto x_main_module_18 = mmain->add_literal(migraphx::generate_literal(
+        migraphx::shape{migraphx::shape::float_type, {64, 3, 11, 11}}, 17));
+    auto x_main_module_19 = mmain->add_literal(
+        migraphx::generate_literal(migraphx::shape{migraphx::shape::float_type, {64}}, 18));
+    auto x_main_module_20 = mmain->add_instruction(
+        migraphx::make_op(
+            "convolution",
+            migraphx::from_json_string(
+                "{dilation:[1,1],group:1,padding:[2,2,2,2],padding_mode:0,stride:[4,4]}")),
+        x_input_1,
+        x_main_module_18);
+    auto x_main_module_21 = mmain->add_instruction(
+        migraphx::make_op("broadcast",
+                          migraphx::from_json_string("{axis:1,out_lens:[1,64,55,55]}")),
+        x_main_module_19);
+    auto x_main_module_22 =
+        mmain->add_instruction(migraphx::make_op("add"), x_main_module_20, x_main_module_21);
+    auto x_main_module_23 = mmain->add_instruction(migraphx::make_op("relu"), x_main_module_22);
+    auto x_main_module_24 = mmain->add_instruction(
+        migraphx::make_op(
+            "pooling",
+            migraphx::from_json_string(
+                "{ceil_mode:0,lengths:[3,3],lp_order:2,mode:1,padding:[0,0,0,0],stride:[2,2]}")),
+        x_main_module_23);
+    auto x_main_module_25 = mmain->add_instruction(
+        migraphx::make_op(
+            "convolution",
+            migraphx::from_json_string(
+                "{dilation:[1,1],group:1,padding:[2,2,2,2],padding_mode:0,stride:[1,1]}")),
+        x_main_module_24,
+        x_main_module_14);
+    auto x_main_module_26 = mmain->add_instruction(
+        migraphx::make_op("broadcast",
+                          migraphx::from_json_string("{axis:1,out_lens:[1,192,27,27]}")),
+        x_main_module_15);
+    auto x_main_module_27 =
+        mmain->add_instruction(migraphx::make_op("add"), x_main_module_25, x_main_module_26);
+    auto x_main_module_28 = mmain->add_instruction(migraphx::make_op("relu"), x_main_module_27);
+    auto x_main_module_29 = mmain->add_instruction(
+        migraphx::make_op(
+            "pooling",
+            migraphx::from_json_string(
+                "{ceil_mode:0,lengths:[3,3],lp_order:2,mode:1,padding:[0,0,0,0],stride:[2,2]}")),
+        x_main_module_28);
+    auto x_main_module_30 = mmain->add_instruction(
+        migraphx::make_op(
+            "convolution",
+            migraphx::from_json_string(
+                "{dilation:[1,1],group:1,padding:[1,1,1,1],padding_mode:0,stride:[1,1]}")),
+        x_main_module_29,
+        x_main_module_12);
+    auto x_main_module_31 = mmain->add_instruction(
+        migraphx::make_op("broadcast",
+                          migraphx::from_json_string("{axis:1,out_lens:[1,384,13,13]}")),
+        x_main_module_13);
+    auto x_main_module_32 =
+        mmain->add_instruction(migraphx::make_op("add"), x_main_module_30, x_main_module_31);
+    auto x_main_module_33 = mmain->add_instruction(migraphx::make_op("relu"), x_main_module_32);
+    auto x_main_module_34 = mmain->add_instruction(
+        migraphx::make_op(
+            "convolution",
+            migraphx::from_json_string(
+                "{dilation:[1,1],group:1,padding:[1,1,1,1],padding_mode:0,stride:[1,1]}")),
+        x_main_module_33,
+        x_main_module_10);
+    auto x_main_module_35 = mmain->add_instruction(
+        migraphx::make_op("broadcast",
+                          migraphx::from_json_string("{axis:1,out_lens:[1,256,13,13]}")),
+        x_main_module_11);
+    auto x_main_module_36 =
+        mmain->add_instruction(migraphx::make_op("add"), x_main_module_34, x_main_module_35);
+    auto x_main_module_37 = mmain->add_instruction(migraphx::make_op("relu"), x_main_module_36);
+    auto x_main_module_38 = mmain->add_instruction(
+        migraphx::make_op(
+            "convolution",
+            migraphx::from_json_string(
+                "{dilation:[1,1],group:1,padding:[1,1,1,1],padding_mode:0,stride:[1,1]}")),
+        x_main_module_37,
+        x_main_module_16);
+    auto x_main_module_39 = mmain->add_instruction(
+        migraphx::make_op("broadcast",
+                          migraphx::from_json_string("{axis:1,out_lens:[1,256,13,13]}")),
+        x_main_module_17);
+    auto x_main_module_40 =
+        mmain->add_instruction(migraphx::make_op("add"), x_main_module_38, x_main_module_39);
+    auto x_main_module_41 = mmain->add_instruction(migraphx::make_op("relu"), x_main_module_40);
+    auto x_main_module_42 = mmain->add_instruction(
+        migraphx::make_op(
+            "pooling",
+            migraphx::from_json_string(
+                "{ceil_mode:0,lengths:[3,3],lp_order:2,mode:1,padding:[0,0,0,0],stride:[2,2]}")),
+        x_main_module_41);
+    auto x_main_module_43 = mmain->add_instruction(
+        migraphx::make_op("reshape", migraphx::from_json_string("{dims:[1,9216]}")),
+        x_main_module_42);
+    auto x_main_module_44 = mmain->add_instruction(
+        migraphx::make_op("transpose", migraphx::from_json_string("{permutation:[1,0]}")),
+        x_main_module_6);
+    auto x_main_module_45 =
+        mmain->add_instruction(migraphx::make_op("dot"), x_main_module_43, x_main_module_44);
+    auto x_main_module_46 = mmain->add_instruction(
+        migraphx::make_op("multibroadcast", migraphx::from_json_string("{out_lens:[1,4096]}")),
+        x_main_module_7);
+    auto x_main_module_47 = mmain->add_instruction(
+        migraphx::make_op("multibroadcast", migraphx::from_json_string("{out_lens:[1,4096]}")),
+        x_main_module_2);
+    auto x_main_module_48 =
+        mmain->add_instruction(migraphx::make_op("mul"), x_main_module_46, x_main_module_47);
+    auto x_main_module_49 =
+        mmain->add_instruction(migraphx::make_op("add"), x_main_module_45, x_main_module_48);
+    auto x_main_module_50 = mmain->add_instruction(migraphx::make_op("relu"), x_main_module_49);
+    auto x_main_module_51 = mmain->add_instruction(
+        migraphx::make_op("transpose", migraphx::from_json_string("{permutation:[1,0]}")),
+        x_main_module_4);
+    auto x_main_module_52 =
+        mmain->add_instruction(migraphx::make_op("dot"), x_main_module_50, x_main_module_51);
+    auto x_main_module_53 = mmain->add_instruction(
+        migraphx::make_op("multibroadcast", migraphx::from_json_string("{out_lens:[1,4096]}")),
+        x_main_module_5);
+    auto x_main_module_54 = mmain->add_instruction(
+        migraphx::make_op("multibroadcast", migraphx::from_json_string("{out_lens:[1,4096]}")),
+        x_main_module_1);
+    auto x_main_module_55 =
+        mmain->add_instruction(migraphx::make_op("mul"), x_main_module_53, x_main_module_54);
+    auto x_main_module_56 =
+        mmain->add_instruction(migraphx::make_op("add"), x_main_module_52, x_main_module_55);
+    auto x_main_module_57 = mmain->add_instruction(migraphx::make_op("relu"), x_main_module_56);
+    auto x_main_module_58 = mmain->add_instruction(
+        migraphx::make_op("transpose", migraphx::from_json_string("{permutation:[1,0]}")),
+        x_main_module_8);
+    auto x_main_module_59 =
+        mmain->add_instruction(migraphx::make_op("dot"), x_main_module_57, x_main_module_58);
+    auto x_main_module_60 = mmain->add_instruction(
+        migraphx::make_op("multibroadcast", migraphx::from_json_string("{out_lens:[1,1000]}")),
+        x_main_module_9);
+    auto x_main_module_61 = mmain->add_instruction(
+        migraphx::make_op("multibroadcast", migraphx::from_json_string("{out_lens:[1,1000]}")),
+        x_main_module_0);
+    auto x_main_module_62 =
+        mmain->add_instruction(migraphx::make_op("mul"), x_main_module_60, x_main_module_61);
+    auto x_main_module_63 =
+        mmain->add_instruction(migraphx::make_op("add"), x_main_module_59, x_main_module_62);
+    mmain->add_return({x_main_module_63});
     return p;
 }
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace driver
 } // namespace migraphx
@@ -210,6 +210,9 @@ struct loader
         auto last = std::prev(mm->end(), trim);
         mm->remove_instructions(last, mm->end());
     }
+    // Remove unused variable when exporting to cpp
+    if(output_type == "cpp")
+        migraphx::run_passes(*p.get_main_module(), {migraphx::dead_code_elimination{}});
     if(optimize)
     {
         migraphx::run_passes(*p.get_main_module(),
@@ -142,7 +142,7 @@ static std::vector<instruction_ref> append_pointwise_module(instruction_ref ins,
             input_map[input] = map_ins[param];
         }
     }
-    pm->replace_return(pm->insert_module_instructions(last, xm, map_ins));
+    pm->replace_return(pm->insert_instructions(last, xm, map_ins));
     return inputs;
 }
@@ -120,9 +120,33 @@ struct module
     instruction_ref move_instructions(instruction_ref src, instruction_ref dst);
-    std::vector<instruction_ref>
-    insert_module_instructions(instruction_ref ins,
-                               module_ref m,
-                               std::unordered_map<instruction_ref, instruction_ref> map_ins = {});
+    std::vector<instruction_ref>
+    add_instructions(const std::vector<instruction_ref>& instructions,
+                     std::unordered_map<instruction_ref, instruction_ref> map_ins = {});
+    std::vector<instruction_ref>
+    add_instructions(module_ref m,
+                     std::unordered_map<instruction_ref, instruction_ref> map_ins = {});
+    std::vector<instruction_ref>
+    add_instructions(instruction_ref start,
+                     instruction_ref last,
+                     std::unordered_map<instruction_ref, instruction_ref> map_ins = {});
+    std::vector<instruction_ref>
+    insert_instructions(instruction_ref ins,
+                        const std::vector<instruction_ref>& instructions,
+                        std::unordered_map<instruction_ref, instruction_ref> map_ins = {});
+    std::vector<instruction_ref>
+    insert_instructions(instruction_ref ins,
+                        module_ref m,
+                        std::unordered_map<instruction_ref, instruction_ref> map_ins = {});
+    std::vector<instruction_ref>
+    insert_instructions(instruction_ref ins,
+                        instruction_ref start,
+                        instruction_ref last,
+                        std::unordered_map<instruction_ref, instruction_ref> map_ins = {});
     template <class... Ts>
     instruction_ref add_literal(Ts&&... xs)
@@ -179,7 +203,9 @@ struct module
     void print_cpp(std::ostream& os) const;
     std::unordered_map<instruction_ref, std::string>
-    print_cpp(std::ostream& os, std::unordered_map<instruction_ref, std::string> names) const;
+    print_cpp(std::ostream& os,
+              const std::string& mname,
+              std::unordered_map<instruction_ref, std::string> names) const;
     void annotate(std::ostream& os, std::function<void(instruction_ref)> a) const;
@@ -196,43 +196,64 @@ struct nonmaxsuppression
         return boxes_heap;
     }
-    template <class H, class S>
-    void select_boxes(H& boxes_heap,
-                      std::vector<std::pair<double, int64_t>>& selected_boxes_inside_class,
-                      std::vector<int64_t>& selected_indices,
-                      S batch_boxes_start,
-                      std::size_t max_output_boxes_per_class,
-                      double iou_threshold,
-                      std::size_t batch_idx,
-                      std::size_t class_idx) const
+    template <class Output, class Boxes, class Scores>
+    std::size_t compute_nms(Output output,
+                            Boxes boxes,
+                            Scores scores,
+                            const shape& max_output_shape,
+                            std::size_t max_output_boxes_per_class,
+                            double iou_threshold,
+                            double score_threshold) const
     {
-        selected_boxes_inside_class.clear();
-        // Get the next box with top score, filter by iou_threshold
-        while(!boxes_heap.empty() &&
-              selected_boxes_inside_class.size() < max_output_boxes_per_class)
-        {
-            // Check with existing selected boxes for this class, remove box if it
-            // exceeds the IOU (Intersection Over Union) threshold
-            const auto next_top_score = boxes_heap.top();
-            bool not_selected =
-                std::any_of(selected_boxes_inside_class.begin(),
-                            selected_boxes_inside_class.end(),
-                            [&](auto selected_index) {
-                                return this->suppress_by_iou(
-                                    batch_box(batch_boxes_start, next_top_score.second),
-                                    batch_box(batch_boxes_start, selected_index.second),
-                                    iou_threshold);
-                            });
-            if(not not_selected)
-            {
-                selected_boxes_inside_class.push_back(next_top_score);
-                selected_indices.push_back(batch_idx);
-                selected_indices.push_back(class_idx);
-                selected_indices.push_back(next_top_score.second);
-            }
-            boxes_heap.pop();
-        }
-    }
+        std::fill(output.begin(), output.end(), 0);
+        const auto& lens       = scores.get_shape().lens();
+        const auto num_batches = lens[0];
+        const auto num_classes = lens[1];
+        const auto num_boxes   = lens[2];
+        // boxes of a class with NMS applied [score, index]
+        std::vector<std::pair<double, int64_t>> selected_boxes_inside_class;
+        std::vector<int64_t> selected_indices;
+        selected_boxes_inside_class.reserve(max_output_shape.elements());
+        // iterate over batches and classes
+        shape comp_s{shape::double_type, {num_batches, num_classes}};
+        shape_for_each(comp_s, [&](auto idx) {
+            auto batch_idx = idx[0];
+            auto class_idx = idx[1];
+            // index offset for this class
+            auto scores_start = scores.begin() + (batch_idx * num_classes + class_idx) * num_boxes;
+            // iterator to first value of this batch
+            auto batch_boxes_start = boxes.begin() + batch_idx * num_boxes * 4;
+            auto boxes_heap = filter_boxes_by_score(scores_start, num_boxes, score_threshold);
+            selected_boxes_inside_class.clear();
+            // Get the next box with top score, filter by iou_threshold
+            while(!boxes_heap.empty() &&
+                  selected_boxes_inside_class.size() < max_output_boxes_per_class)
+            {
+                // Check with existing selected boxes for this class, remove box if it
+                // exceeds the IOU (Intersection Over Union) threshold
+                const auto next_top_score = boxes_heap.top();
+                bool not_selected =
+                    std::any_of(selected_boxes_inside_class.begin(),
+                                selected_boxes_inside_class.end(),
+                                [&](auto selected_index) {
+                                    return this->suppress_by_iou(
+                                        batch_box(batch_boxes_start, next_top_score.second),
+                                        batch_box(batch_boxes_start, selected_index.second),
+                                        iou_threshold);
+                                });
+                if(not not_selected)
+                {
+                    selected_boxes_inside_class.push_back(next_top_score);
+                    selected_indices.push_back(batch_idx);
+                    selected_indices.push_back(class_idx);
+                    selected_indices.push_back(next_top_score.second);
+                }
+                boxes_heap.pop();
+            }
+        });
+        std::copy(selected_indices.begin(), selected_indices.end(), output.begin());
+        return selected_indices.size() / 3;
+    }
     argument compute(const shape& output_shape, std::vector<argument> args) const
@@ -253,40 +274,15 @@ struct nonmaxsuppression
         result.visit([&](auto output) {
             visit_all(args[0], args[1])([&](auto boxes, auto scores) {
-                const auto& lens       = scores.get_shape().lens();
-                const auto num_batches = lens[0];
-                const auto num_classes = lens[1];
-                const auto num_boxes   = lens[2];
-                // boxes of a class with NMS applied [score, index]
-                std::vector<std::pair<double, int64_t>> selected_boxes_inside_class;
-                std::vector<int64_t> selected_indices;
-                selected_boxes_inside_class.reserve(max_output_shape.elements());
-                // iterate over batches and classes
-                shape comp_s{shape::double_type, {num_batches, num_classes}};
-                shape_for_each(comp_s, [&](auto idx) {
-                    auto batch_idx = idx[0];
-                    auto class_idx = idx[1];
-                    // index offset for this class
-                    auto scores_start =
-                        scores.begin() + (batch_idx * num_classes + class_idx) * num_boxes;
-                    // iterator to first value of this batch
-                    auto batch_boxes_start = boxes.begin() + batch_idx * num_boxes * 4;
-                    auto boxes_heap =
-                        filter_boxes_by_score(scores_start, num_boxes, score_threshold);
-                    select_boxes(boxes_heap,
-                                 selected_boxes_inside_class,
-                                 selected_indices,
-                                 batch_boxes_start,
-                                 max_output_boxes_per_class,
-                                 iou_threshold,
-                                 batch_idx,
-                                 class_idx);
-                });
-                std::copy(selected_indices.begin(), selected_indices.end(), output.begin());
-                num_selected = selected_indices.size() / 3;
+                num_selected = compute_nms(output,
+                                           boxes,
+                                           scores,
+                                           max_output_shape,
+                                           max_output_boxes_per_class,
+                                           iou_threshold,
+                                           score_threshold);
             });
         });
         return result.reshape({output_shape.type(), {num_selected, 3}});
     }
 };
@@ -44,8 +44,8 @@ auto with_char(F f)
     return [=](unsigned char c) -> bool { return f(c); };
 }
-inline std::string
-replace_string(std::string subject, const std::string& search, const std::string& replace)
+inline void
+replace_string_inplace(std::string& subject, const std::string& search, const std::string& replace)
 {
     size_t pos = 0;
     while((pos = subject.find(search, pos)) != std::string::npos)
@@ -53,6 +53,12 @@ replace_string(std::string subject, const std::string& search, const std::string& replace)
         subject.replace(pos, search.length(), replace);
         pos += replace.length();
     }
+}
+
+inline std::string
+replace_string(std::string subject, const std::string& search, const std::string& replace)
+{
+    replace_string_inplace(subject, search, replace);
     return subject;
 }
@@ -35,7 +35,7 @@ static void inline_submodule(module& m, instruction_ref ins, bool cond)
 {
     const auto& mod_inputs = ins->module_inputs();
     module_ref smod        = cond ? mod_inputs.at(0) : mod_inputs.at(1);
-    auto mod_outputs       = m.insert_module_instructions(ins, smod);
+    auto mod_outputs       = m.insert_instructions(ins, smod);
     auto ins_outputs       = ins->outputs();
     assert(mod_outputs.size() >= ins_outputs.size());
@@ -35,6 +35,7 @@
 #include <migraphx/make_op.hpp>
 #include <migraphx/register_target.hpp>
 #include <migraphx/make_op.hpp>
+#include <migraphx/json.hpp>
 #include <iostream>
 #include <sstream>
 #include <algorithm>
@@ -196,6 +197,62 @@ void module::assign(const module& m)
     }
 }
+template <class Range>
+static std::vector<instruction_ref>
+insert_generic_instructions(module& m,
+                            instruction_ref ins,
+                            Range&& instructions,
+                            std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    assert(m.has_instruction(ins) or is_end(ins, m.end()));
+    std::vector<instruction_ref> mod_outputs;
+    instruction_ref last;
+    for(instruction_ref sins : instructions)
+    {
+        last = sins;
+        if(contains(map_ins, sins))
+            continue;
+        instruction_ref copy_ins;
+        if(sins->name() == "@literal")
+        {
+            auto l   = sins->get_literal();
+            copy_ins = m.add_literal(l);
+        }
+        else if(sins->name() == "@param")
+        {
+            auto&& name = any_cast<builtin::param>(sins->get_operator()).parameter;
+            auto s      = sins->get_shape();
+            copy_ins    = m.add_parameter(name, s);
+        }
+        else if(sins->name() == "@outline")
+        {
+            auto s   = sins->get_shape();
+            copy_ins = m.add_outline(s);
+        }
+        else
+        {
+            auto mod_args = sins->module_inputs();
+            auto inputs   = sins->inputs();
+            std::vector<instruction_ref> copy_inputs(inputs.size());
+            std::transform(inputs.begin(), inputs.end(), copy_inputs.begin(), [&](auto i) {
+                return contains(map_ins, i) ? map_ins[i] : i;
+            });
+            if(sins->name() == "@return")
+            {
+                mod_outputs = copy_inputs;
+                break;
+            }
+            copy_ins = m.insert_instruction(ins, sins->get_operator(), copy_inputs, mod_args);
+        }
+        map_ins[sins] = copy_ins;
+    }
+    if(mod_outputs.empty() and instructions.begin() != instructions.end())
+        mod_outputs = {map_ins.at(last)};
+    return mod_outputs;
+}
 instruction_ref module::add_instruction(const operation& op, std::vector<instruction_ref> args)
 {
     return insert_instruction(impl->instructions.end(), op, std::move(args));
@@ -334,53 +391,49 @@ instruction_ref module::move_instructions(instruction_ref src, instruction_ref dst)
     return src;
 }
-std::vector<instruction_ref> module::insert_module_instructions(
-    instruction_ref ins, module_ref m, std::unordered_map<instruction_ref, instruction_ref> map_ins)
-{
-    std::vector<instruction_ref> mod_outputs;
-    for(auto sins : iterator_for(*m))
-    {
-        if(contains(map_ins, sins))
-            continue;
-        instruction_ref copy_ins;
-        if(sins->name() == "@literal")
-        {
-            auto l   = sins->get_literal();
-            copy_ins = this->add_literal(l);
-        }
-        else if(sins->name() == "@param")
-        {
-            auto&& name = any_cast<builtin::param>(sins->get_operator()).parameter;
-            auto s      = sins->get_shape();
-            copy_ins    = this->add_parameter(name, s);
-        }
-        else if(sins->name() == "@outline")
-        {
-            auto s   = sins->get_shape();
-            copy_ins = this->add_outline(s);
-        }
-        else
-        {
-            auto mod_args = sins->module_inputs();
-            auto inputs   = sins->inputs();
-            std::vector<instruction_ref> copy_inputs(inputs.size());
-            std::transform(inputs.begin(), inputs.end(), copy_inputs.begin(), [&](auto i) {
-                return contains(map_ins, i) ? map_ins[i] : i;
-            });
-            if(sins->name() == "@return")
-            {
-                mod_outputs = copy_inputs;
-                break;
-            }
-            copy_ins = this->insert_instruction(ins, sins->get_operator(), copy_inputs, mod_args);
-        }
-        map_ins[sins] = copy_ins;
-    }
-    if(mod_outputs.empty())
-        mod_outputs = {map_ins.at(std::prev(m->end()))};
-    return mod_outputs;
-}
+std::vector<instruction_ref>
+module::add_instructions(const std::vector<instruction_ref>& instructions,
+                         std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    return this->insert_instructions(this->end(), instructions, std::move(map_ins));
+}
+std::vector<instruction_ref>
+module::add_instructions(module_ref m, std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    return this->insert_instructions(this->end(), m, std::move(map_ins));
+}
+std::vector<instruction_ref>
+module::add_instructions(instruction_ref start,
+                         instruction_ref last,
+                         std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    return this->insert_instructions(this->end(), start, last, std::move(map_ins));
+}
+std::vector<instruction_ref>
+module::insert_instructions(instruction_ref ins,
+                            const std::vector<instruction_ref>& instructions,
+                            std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    return insert_generic_instructions(*this, ins, instructions, std::move(map_ins));
+}
+std::vector<instruction_ref> module::insert_instructions(
+    instruction_ref ins, module_ref m, std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    return insert_generic_instructions(*this, ins, iterator_for(*m), std::move(map_ins));
+}
+std::vector<instruction_ref>
+module::insert_instructions(instruction_ref ins,
+                            instruction_ref start,
+                            instruction_ref last,
+                            std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    auto r = range(start, last);
+    return insert_generic_instructions(*this, ins, iterator_for(r), std::move(map_ins));
+}
 instruction_ref module::add_literal(literal l)
@@ -706,44 +759,33 @@ void module::print_graph(std::ostream& os, bool brief) const
     os << "}" << std::endl;
 }
-static std::string cpp_var_name(const std::string& name)
+static std::string to_c_id(const std::string& name, char rep = '_')
 {
-    return "m" + replace_string(name, "@", "x");
+    std::string id = transform_string(name, [&](auto c) {
+        if(with_char(::isalnum)(c) or c == '_')
+            return c;
+        return rep;
+    });
+    while(contains(id, "__"))
+        replace_string_inplace(id, "__", "_");
+    return id;
 }
-static std::string cpp_op_var(const std::string& name, instruction_ref ins)
+static std::string cpp_var_name(const std::string& name)
 {
-    return replace_string(name, "@", ins->name());
+    return to_c_id("x_" + replace_string(name, ":", "_module_"));
 }
-static void print_op_attributes(std::ostream& os, const std::string& name, const operation& op)
+static void print_make_op(std::ostream& os, const operation& op)
 {
-    std::string x = to_string(op);
-    if(contains(x, "["))
+    os << "migraphx::make_op(" << enclose_name(op.name());
+    auto v = op.to_value();
+    if(not v.empty())
     {
-        auto start = x.find('[');
-        auto end   = x.find(']');
-        std::string attribute_text = x.substr(start + 1, end - start - 1);
-        std::vector<std::string> attributes;
-        for(auto&& attribute : split_string(attribute_text, ','))
-        {
-            if(contains(attribute, '='))
-                attributes.push_back(attribute);
-            else
-                attributes.back() += "," + attribute;
-        }
-        for(auto&& attribute : attributes)
-        {
-            auto p     = split_string(attribute, '=');
-            auto key   = p.front();
-            auto value = p.back();
-            if(contains({"bn_mode", "padding_mode"}, key))
-                continue;
-            if(key == "mode")
-                value = enclose_name(trim(value));
-            os << name << "." << key << " = " << value << ";" << std::endl;
-        }
+        os << ", "
+           << "migraphx::from_json_string(" << enclose_name(to_json_string(v)) << ")";
     }
+    os << ")";
 }
 static void print_cpp_shape(std::ostream& os, const migraphx::shape& s)
@@ -756,22 +798,25 @@ static void print_cpp_shape(std::ostream& os, const migraphx::shape& s)
 }
 std::unordered_map<instruction_ref, std::string>
-module::print_cpp(std::ostream& os, std::unordered_map<instruction_ref, std::string> names) const
+module::print_cpp(std::ostream& os,
+                  const std::string& mname,
+                  std::unordered_map<instruction_ref, std::string> names) const
 {
-    os << "migraphx::module p;" << std::endl;
-    unsigned long seed = 0;
+    // cppcheck-suppress variableScope
+    unsigned long seed = names.size();
+    auto last          = std::prev(this->end());
     names = this->print(
         [&](auto ins, auto ins_names) {
-            auto op = cpp_op_var(ins_names.at(ins), ins);
-            if(ins->name().front() != '@')
-            {
-                os << "migraphx::op::" << ins->name() << " " << op << ";" << std::endl;
-                print_op_attributes(os, op, ins->get_operator());
-            }
-            os << "auto " << cpp_var_name(ins_names.at(ins)) << " = ";
+            std::vector<std::string> input_vars;
+            std::transform(ins->inputs().begin(),
+                           ins->inputs().end(),
+                           std::back_inserter(input_vars),
+                           [&](auto input) { return cpp_var_name(ins_names.at(input)); });
+            if(ins != last)
+                os << "auto " << cpp_var_name(ins_names.at(ins)) << " = ";
            if(ins->name() == "@literal")
             {
-                os << "p.add_literal(";
+                os << mname << "->add_literal(";
                 bool use_abs = false;
                 ins->get_literal().visit([&](auto v) {
                     use_abs = std::none_of(v.begin(), v.end(), [](auto x) { return x < 0; });
@@ -789,17 +834,22 @@ module::print_cpp(std::ostream& os, std::unordered_map<instruction_ref, std::string> names) const
             else if(ins->name() == "@param")
             {
                 std::string name = any_cast<builtin::param>(ins->get_operator()).parameter;
-                os << "p.add_parameter(" << enclose_name(name) << ",";
+                os << mname << "->add_parameter(" << enclose_name(name) << ",";
                 print_cpp_shape(os, ins->get_shape());
                 os << ");" << std::endl;
             }
+            else if(ins->name() == "@return")
+            {
+                os << mname << "->add_return({";
+                os << join_strings(input_vars, ", ");
+                os << "});" << std::endl;
+            }
             else
             {
-                os << "p.add_instruction(" << op;
-                for(auto input : ins->inputs())
-                {
-                    os << ", " << cpp_var_name(ins_names.at(input));
-                }
+                assert(ins->name().front() != '@');
+                os << mname << "->add_instruction(";
+                print_make_op(os, ins->get_operator());
+                os << ", " << join_strings(input_vars, ", ");
                 os << ");" << std::endl;
             }
         },
@@ -808,7 +858,7 @@ module::print_cpp(std::ostream& os, std::unordered_map<instruction_ref, std::string> names) const
     return names;
 }
-void module::print_cpp(std::ostream& os) const { this->print_cpp(os, {}); }
+void module::print_cpp(std::ostream& os) const { this->print_cpp(os, this->name(), {}); }
 void module::annotate(std::ostream& os, std::function<void(instruction_ref)> a) const
 {