"docs/vscode:/vscode.git/clone" did not exist on "9520f251808add32cd7c6b36f65fc6e031eaa3ff"
Commit 9b929d4e authored by charlie

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_model_test

parents c4b1102e 4394e9b3
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@0.6.0
+        uses: styfle/cancel-workflow-action@0.11.0
         with:
           access_token: ${{ github.token }}
   tidy:
@@ -15,9 +15,19 @@ jobs:
     steps:
       - name: Free space
-        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-      - uses: actions/checkout@v2
+        run: |
+          sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+          du . --max-depth=1 -h
+          ls -la
+          cd /usr/local
+          du . --max-depth=1 -h
+          ls -la
+          cd /usr/local/lib
+          echo $(pwd)
+          du . --max-depth=1 -h
+          ls -la
+      - uses: actions/checkout@v3
       # In this step, this action saves a list of existing images,
       # the cache is created without them in the post run.
@@ -34,7 +44,7 @@ jobs:
           message("::set-output name=timestamp::${current_date}")
       - name: Cache files for tidy
-        uses: pat-s/always-upload-cache@v2.1.3
+        uses: pat-s/always-upload-cache@v3.0.11
         with:
           path: tidy-cache
           key: tidy-cache-${{ steps.cache_timestamp.outputs.timestamp }}
@@ -65,8 +75,8 @@ jobs:
     steps:
       - name: Free space
-        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-      - uses: actions/checkout@v2
+        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+      - uses: actions/checkout@v3
       # In this step, this action saves a list of existing images,
       # the cache is created without them in the post run.
@@ -110,8 +120,8 @@ jobs:
     steps:
      - name: Free space
-        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-      - uses: actions/checkout@v2
+        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+      - uses: actions/checkout@v3
       # In this step, this action saves a list of existing images,
       # the cache is created without them in the post run.
@@ -146,10 +156,10 @@ jobs:
     steps:
       - name: Free space
-        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-      - uses: actions/checkout@v2
+        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+      - uses: actions/checkout@v3
       - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: 3.8
       - name: Install pyflakes
@@ -167,10 +177,10 @@ jobs:
     steps:
       - name: Free space
-        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-      - uses: actions/checkout@v2
+        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+      - uses: actions/checkout@v3
       - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: 3.8
       - name: run License Check
@@ -198,16 +208,16 @@ jobs:
     steps:
       - name: Free space
-        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-      - uses: actions/checkout@v2
+        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+      - uses: actions/checkout@v3
       - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: 3.7
       - name: Cache dependencies
         # Ignore the failure of a step and avoid terminating the job.
         continue-on-error: true
-        uses: actions/cache@v2
+        uses: actions/cache@v3
         with:
           # This path is specific to Ubuntu
           path: ${{ github.workspace }}/cget
@@ -294,16 +304,16 @@ jobs:
     steps:
       - name: Free space
-        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-      - uses: actions/checkout@v2
+        run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+      - uses: actions/checkout@v3
       - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: 3.7
       - name: Cache dependencies
         # Ignore the failure of a step and avoid terminating the job.
         continue-on-error: true
-        uses: actions/cache@v2
+        uses: actions/cache@v3
         with:
           # This path is specific to Ubuntu
           path: ${{ github.workspace }}/cget
@@ -5,14 +5,14 @@ on:
     branches: [develop]
     types: [opened, synchronize, closed]
   schedule:
-    - cron: "0 5 * * 1-6"
+    - cron: "0 6 * * 1-6"
   workflow_dispatch:
     inputs:
       rocm_release:
         description: ROCm Version
         required: true
-        default: '5.2'
+        default: '5.3'
       performance_reports_repo:
         description: Result repository
         required: true
@@ -30,9 +30,9 @@ concurrency: "perftest-${{ github.head_ref || github.base_ref || 'schedule' }}"
 jobs:
   release:
-    uses: rocmsoftwareplatform/migraphx-benchmark/.github/workflows/perf-test.yml@main
+    uses: ROCmSoftwarePlatform/migraphx-benchmark/.github/workflows/perf-test.yml@main
     with:
-      rocm_release: ${{ github.event.inputs.rocm_release || '5.2' }}
+      rocm_release: ${{ github.event.inputs.rocm_release || '5.3' }}
       result_number: ${{ github.event.inputs.result_number || '10' }}
       flags: ${{ github.event.inputs.flags || '-s' }}
       performance_reports_repo: ${{ github.event.inputs.performance_reports_repo || 'ROCmSoftwarePlatform/migraphx-reports' }}
@@ -74,7 +74,8 @@ RUN cget -p $PREFIX install facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cma
 RUN cget -p $PREFIX install ccache@v4.1 -DENABLE_TESTING=OFF
 # Install newer cmake for onnx runtime
-RUN cget -p /opt/cmake install kitware/cmake@v3.13.4
+ARG CMAKE_VERSION=3.24.2
+RUN cget -p /opt/cmake install -X binary https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz
 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
 ARG ONNXRUNTIME_BRANCH=main
@@ -86,7 +87,7 @@ RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXR
 ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh
-RUN cget -p /usr/local install ROCmSoftwarePlatform/llvm-project-mlir@c0723a7e50043d973cb73ae51dc30d36679ee7e5 -DBUILD_MIXR_TARGET=On
+RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@78b706fe9879587ab98b6614ae539265374a3fae -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off
 ENV MIOPEN_FIND_DB_PATH=/tmp/miopen/find-db
 ENV MIOPEN_USER_DB_PATH=/tmp/miopen/user-db
@@ -29,6 +29,7 @@ See below for a comprehensive list of commands and option arguments, as well as
 | --tf | Load file as a tensorflow graph |
 | --migraphx | Load file as a migraphx graph |
 | --migraphx-json | Load file as a migraphx JSON graph |
+| --batch | Set batch size for the model |
 | --nhwc | Treat tensorflow format as nhwc |
 | --nchw | Treat tensorflow format as nchw |
 | --skip-unknown-operators | Skip unknown operators when parsing and continue to parse |
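The new --batch flag sets the batch size used when loading the model. An illustrative invocation, assuming the driver's perf subcommand and a hypothetical model file:

migraphx-driver perf resnet50.onnx --batch 32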
@@ -21,6 +21,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 #####################################################################################
-tensorflow==2.7.2
+tensorflow==2.9.3
 onnxruntime
 tokenizers
\ No newline at end of file
@@ -55,6 +55,7 @@ add_library(migraphx
     insert_pad.cpp
     instruction.cpp
     json.cpp
+    layout_nhwc.cpp
     load_save.cpp
     make_op.cpp
     module.cpp
@@ -144,6 +145,7 @@ register_migraphx_ops(
     if_op
     im2col
     isnan
+    layout
     leaky_relu
     less
     load
@@ -32,6 +32,7 @@
 #include <memory>
 #include <numeric>
 #include <exception>
+#include <array>
 #include <vector>
 #include <cassert>
 #include <iostream>
@@ -59,6 +59,8 @@ void auto_contiguous::apply(module& m) const
     auto last = std::prev(m.end());
     for(auto ins : iterator_for(m))
     {
+        if(ins->name() == "layout")
+            continue;
         // for last instruction that is NOT a return
         if(ins->outputs().empty() and ins != last)
             continue;
@@ -68,20 +68,15 @@ std::vector<std::size_t> compute_broadcasted_lens(std::vector<std::size_t> s0,
 std::vector<shape::dynamic_dimension> compute_broadcasted_dyn_dims(shape s0, shape s1)
 {
     assert(s0.dynamic() or s1.dynamic());
     // change both shapes to dynamic_dimension representation
-    if(not s0.dynamic())
-        s0 = s0.to_dynamic();
-    if(not s1.dynamic())
-        s1 = s1.to_dynamic();
+    s0 = s0.to_dynamic();
+    s1 = s1.to_dynamic();
     if(s0.ndim() > s1.ndim())
     {
         std::swap(s0, s1);
     }
     auto offset = s1.ndim() - s0.ndim();
     std::vector<shape::dynamic_dimension> out_dims(s1.dyn_dims());
-    shape::dynamic_dimension one_dyn_dim{1, 1, 0};
     std::transform(
         s0.dyn_dims().cbegin(),
         s0.dyn_dims().cend(),
@@ -92,7 +87,7 @@ std::vector<shape::dynamic_dimension> compute_broadcasted_dyn_dims(shape s0, sha
         {
             return a;
         }
-        else if(a == one_dyn_dim or b == one_dyn_dim)
+        else if(a == 1 or b == 1)
         {
             // setting opt to 0, may need to be changed
             return shape::dynamic_dimension{std::max(a.min, b.min), std::max(a.max, b.max), 0};
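The broadcasting rule above pairs dimensions right-aligned: equal ranges pass through, and a fixed dimension of 1 stretches to the other operand's range. A self-contained C++ sketch of that rule (plain structs, not MIGraphX's shape types):

// Standalone sketch of the dynamic-dimension broadcast rule used above:
// equal ranges pass through, a fixed dimension of 1 broadcasts against the
// other range, and anything else is an error.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <stdexcept>

struct dyn_dim
{
    std::size_t min;
    std::size_t max;
    bool operator==(const dyn_dim& other) const { return min == other.min and max == other.max; }
    bool is_fixed(std::size_t v) const { return min == v and max == v; }
};

dyn_dim broadcast_dim(const dyn_dim& a, const dyn_dim& b)
{
    if(a == b)
        return a;
    if(a.is_fixed(1) or b.is_fixed(1))
        return {std::max(a.min, b.min), std::max(a.max, b.max)};
    throw std::runtime_error("dimensions are not broadcastable");
}

int main()
{
    dyn_dim x{1, 1}; // fixed 1, broadcasts
    dyn_dim y{2, 8}; // dynamic range [2, 8]
    dyn_dim r = broadcast_dim(x, y);
    std::cout << r.min << ".." << r.max << "\n"; // prints 2..8
}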
@@ -51,8 +51,8 @@ void dead_code_elimination::apply(module& m) const
         // Skip instruction with empty shape as output unless its [dynamic, builtin, undefined,
         // identity, allocate]
         if((not i->get_shape().dynamic() and i->get_shape().elements() == 0) and
-           i->name().front() != '@' and
-           not contains({"undefined", "identity", "allocate"}, i->name()))
+           not(i->name().front() == '@') and not contains({"identity", "allocate"}, i->name()) and
+           not i->is_undefined())
             continue;
         assert(std::distance(m.begin(), i) <= std::distance(m.begin(), last));
         std::unordered_set<instruction_ref> visited;
@@ -109,8 +109,12 @@ struct loader
         ap(brief, {"--brief"}, ap.help("Make the output brief."), ap.set_value(true));
         ap(output_type,
            {"--cpp"},
-           ap.help("Print out the program as cpp program."),
+           ap.help("Print out the program as C++ program."),
            ap.set_value("cpp"));
+        ap(output_type,
+           {"--python", "--py"},
+           ap.help("Print out the program as python program."),
+           ap.set_value("py"));
         ap(output_type, {"--json"}, ap.help("Print out program as json."), ap.set_value("json"));
         ap(output_type,
            {"--text"},
@@ -259,7 +263,9 @@ struct loader
             type = "binary";
         }
-        if(type == "cpp")
+        if(type == "py")
+            p.print_py(*os);
+        else if(type == "cpp")
             p.print_cpp(*os);
         else if(type == "graphviz")
             p.print_graph(*os, brief);
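With this change the driver can emit a loaded program as Python source in addition to C++. A hypothetical invocation, assuming the new flag is used with the driver's read command:

migraphx-driver read model.onnx --py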
@@ -42,6 +42,13 @@ static bool try_compute_shape(instruction_ref ins,
     try
     {
         shape new_shape = ins->get_operator().compute_shape(inputs, mods);
+        // Cannot tell if a dynamic shape will need to be made contiguous
+        if(new_shape.dynamic())
+        {
+            return false;
+        }
         // If the output shape is a standard shape, no need to try its output
         if(new_shape.standard())
         {
@@ -133,14 +140,20 @@ static void remove_contiguous(const std::string& op_name, module& m, F f)
         }
     }
-    // Perform evaluations in parallel
+    // Perform static contiguous evaluations in parallel
     std::vector<argument> literals(const_instructions.size());
     par_for(const_instructions.size(), 1, [&](const auto i) {
-        auto c = op::contiguous{};
-        auto prev = const_instructions[i]->inputs().front();
-        literals[i] = c.compute(c.compute_shape({prev->get_shape()}), {prev->eval()});
+        auto c    = op::contiguous{};
+        auto prev = const_instructions[i]->inputs().front();
+        // compute the output contiguous shape from the previous instruction shape
+        shape computed_shape = c.compute_shape({prev->get_shape()});
+        const std::vector<argument>& prev_eval = {prev->eval()};
+        // prev_eval should not be used in make_compute_output_shape() as computed_shape is static
+        auto co_shape = make_compute_output_shape(pack(c, computed_shape, prev_eval));
+        literals[i]   = c.compute(co_shape, prev_eval);
     });
+    // Replace static contiguous operations with a literal
     for(size_t i = 0; i < const_instructions.size(); i++)
     {
         auto l = m.add_literal(literals[i].get_shape(), literals[i].data());
@@ -30,23 +30,31 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 template <class T>
-T generic_read_file(const std::string& filename)
+T generic_read_file(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
 {
     std::ifstream is(filename, std::ios::binary | std::ios::ate);
-    std::streamsize size = is.tellg();
-    if(size < 1)
+    if(nbytes == 0)
+    {
+        // if there is a non-zero offset and nbytes is not set,
+        // calculate size of remaining bytes to read
+        nbytes = is.tellg();
+        if(offset > nbytes)
+            MIGRAPHX_THROW("offset is larger than file size");
+        nbytes -= offset;
+    }
+    if(nbytes < 1)
         MIGRAPHX_THROW("Invalid size for: " + filename);
-    is.seekg(0, std::ios::beg);
+    is.seekg(offset, std::ios::beg);
-    T buffer(size, 0);
-    if(not is.read(&buffer[0], size))
+    T buffer(nbytes, 0);
+    if(not is.read(&buffer[0], nbytes))
         MIGRAPHX_THROW("Error reading file: " + filename);
     return buffer;
 }
-std::vector<char> read_buffer(const std::string& filename)
+std::vector<char> read_buffer(const std::string& filename, size_t offset, size_t nbytes)
 {
-    return generic_read_file<std::vector<char>>(filename);
+    return generic_read_file<std::vector<char>>(filename, offset, nbytes);
 }
 std::string read_string(const std::string& filename)
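A minimal usage sketch of the extended reader; the header name and file paths are assumptions for illustration only:

// Hedged usage sketch of the extended read_buffer API shown above; assumes it
// is declared in <migraphx/file_buffer.hpp>, and "weights.bin" is hypothetical.
#include <migraphx/file_buffer.hpp>
#include <vector>

void example()
{
    auto whole = migraphx::read_buffer("weights.bin");          // entire file
    auto slice = migraphx::read_buffer("weights.bin", 128, 64); // 64 bytes starting at offset 128
    auto tail  = migraphx::read_buffer("weights.bin", 128);     // offset 128 to end of file
}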
@@ -45,7 +45,16 @@ static literal get_scalar(instruction_ref ins)
         return {};
     auto e = ins->eval();
     literal r{};
-    e.visit_at([&](auto x) { r = literal{x}; });
+    // needed for bool as visit_at invokes as() which promotes bool to int8
+    // Without this we'll break type checks for logical ops that are fused.
+    if(e.get_shape().type() == shape::bool_type)
+    {
+        r = literal{e.at<bool>()};
+    }
+    else
+    {
+        e.visit_at([&](auto x) { r = literal{x}; });
+    }
     return r;
 }
@@ -56,6 +65,8 @@ static void create_pointwise_modules(module_pass_manager& mpm)
     {
         if(not ins->get_operator().attributes().get("pointwise", false))
             continue;
+        if(ins->get_operator().name() == "layout")
+            continue;
         assert(ins->get_operator().attributes().contains("point_op"));
         auto* pm = mpm.create_module(mpm.get_module().name() + ":pointwise" + std::to_string(n++));
         pm->set_bypass();
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_CHECK_SHAPES_HPP
 #define MIGRAPHX_GUARD_RTGLIB_CHECK_SHAPES_HPP
+#include <migraphx/permutation.hpp>
 #include <migraphx/shape.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/stringutils.hpp>
@@ -197,7 +198,7 @@ struct check_shapes
     */
    const check_shapes& same_ndims() const
    {
-        if(not this->same([](const shape& s) { return s.max_lens().size(); }))
+        if(not this->same([](const shape& s) { return s.ndim(); }))
            MIGRAPHX_THROW(prefix() + "Number of dimensions do not match");
        return *this;
    }
@@ -232,6 +233,19 @@ struct check_shapes
        return *this;
    }
    /*!
+     * Check all shapes are packed with certain layouts
+     */
+    const check_shapes&
+    packed_layouts(const std::initializer_list<std::vector<int64_t>>& layouts) const
+    {
+        if(not this->all_of([&](const shape& s) {
+               return s.packed() and contains(layouts, find_permutation(s));
+           }))
+            MIGRAPHX_THROW(prefix() + "Shapes are not packed with correct layout");
+        return *this;
+    }
+    /*!
     * Check all shapes are packed or broadcasted.
     */
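A sketch of how an operator might call the new check; example_op is hypothetical and only illustrates restricting inputs to packed NCHW ({0,1,2,3}) or NHWC ({0,2,3,1}) layouts:

// Hypothetical operator, for illustration only; uses the check_shapes idiom
// found throughout MIGraphX operator compute_shape functions.
#include <migraphx/check_shapes.hpp>
#include <migraphx/shape.hpp>
#include <string>
#include <vector>

struct example_op
{
    std::string name() const { return "example_op"; }
    migraphx::shape compute_shape(std::vector<migraphx::shape> inputs) const
    {
        // Accept a single input packed in either NCHW or NHWC order.
        migraphx::check_shapes{inputs, *this}.has(1).packed_layouts({{0, 1, 2, 3}, {0, 2, 3, 1}});
        return inputs.front();
    }
};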
@@ -31,7 +31,7 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-std::vector<char> read_buffer(const std::string& filename);
+std::vector<char> read_buffer(const std::string& filename, size_t offset = 0, size_t nbytes = 0);
 std::string read_string(const std::string& filename);
 void write_buffer(const std::string& filename, const char* buffer, std::size_t size);
@@ -121,6 +121,8 @@ struct instruction
     bool can_eval() const;
+    bool is_undefined() const;
     argument eval(bool check_eval = true) const;
     void finalize(context& ctx);
@@ -21,28 +21,27 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#ifndef MIGRAPHX_GUARD_RTGLIB_INT_DIVIDE_HPP
-#define MIGRAPHX_GUARD_RTGLIB_INT_DIVIDE_HPP
+#ifndef MIGRAPHX_GUARD_MIGRAPHX_LAYOUT_NHWC_HPP
+#define MIGRAPHX_GUARD_MIGRAPHX_LAYOUT_NHWC_HPP
+#include <string>
+#include <migraphx/instruction_ref.hpp>
 #include <migraphx/config.hpp>
-#include <cmath>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-template <class R, class T, class U>
-R floor_divide(T x, U y)
-{
-    return R(std::floor(double(x) / double(y)));
-}
+struct module_pass_manager;
-template <class R, class T, class U>
-R ceil_divide(T x, U y)
+/**
+ * Transform convolutions to nhwc
+ */
+struct layout_nhwc
 {
-    return R(std::ceil(double(x) / double(y)));
-}
+    std::string name() const { return "layout_nhwc"; }
+    void apply(module_pass_manager& mpm) const;
+};
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-#endif
+#endif // MIGRAPHX_GUARD_MIGRAPHX_LAYOUT_NHWC_HPP
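A sketch of invoking the new pass; the pass list is illustrative rather than the pipeline MIGraphX actually uses, and assumes the run_passes helper from the pass manager:

// Hedged sketch: run layout_nhwc on a program, then dead_code_elimination to
// clean up instructions the layout rewrite leaves unused.
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/layout_nhwc.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/program.hpp>

void to_nhwc(migraphx::program& p)
{
    migraphx::run_passes(p, {migraphx::layout_nhwc{}, migraphx::dead_code_elimination{}});
}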
@@ -80,6 +80,7 @@ struct literal : raw_data<literal>
         fill(start, end);
     }
+    // Directly copies buffer of x
     template <class T, MIGRAPHX_REQUIRES(sizeof(T) == 1)>
     literal(const shape& s, T* x) : buffer(make_shared_array<char>(s.bytes())), m_shape(s)
     {
@@ -107,25 +108,15 @@ struct literal : raw_data<literal>
     std::shared_ptr<char> buffer;
     shape m_shape;
+    // Keeps the same data ordering as the given container
     template <class Iterator>
     void fill(Iterator start, Iterator end)
     {
         assert(std::distance(start, end) == m_shape.elements());
-        if(m_shape.standard())
-        {
-            m_shape.visit_type([&](auto as) { std::copy(start, end, as.from(buffer.get())); });
-        }
-        else
-        {
-            auto it = start;
-            m_shape.visit_type([&](auto as) {
-                auto output = make_view(m_shape, as.from(buffer.get()));
-                shape_for_each(output.get_shape(), [&](const auto& idx) {
-                    output(idx.begin(), idx.end()) = *it; // NOLINT(bugprone-signed-char-misuse)
-                    it++;
-                });
-            });
-        }
+        m_shape.visit_type([&](auto as) {
+            auto output = make_view(m_shape, as.from(buffer.get()));
+            std::copy(start, end, output.begin());
+        });
     }
 };
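The simplified fill() routes every layout through a shaped view, so std::copy lands each element at its strided offset. A self-contained sketch of that index mapping (plain C++, not MIGraphX code):

// Standalone illustration: copy a logically ordered sequence into a strided
// buffer, which is what iterating a shaped view hides behind std::copy.
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    // A 2x3 "literal" stored column-major: strides {1, 2}.
    std::vector<std::size_t> lens{2, 3}, strides{1, 2};
    std::vector<int> buffer(6);
    std::vector<int> data{1, 2, 3, 4, 5, 6}; // logical (row-major) order

    for(std::size_t i = 0; i < data.size(); i++)
    {
        std::size_t row = i / lens[1], col = i % lens[1];
        buffer[row * strides[0] + col * strides[1]] = data[i];
    }
    for(int v : buffer)
        std::cout << v << ' '; // prints 1 4 2 5 3 6
}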
@@ -205,6 +205,12 @@ struct module
     void print_graph(std::ostream& os, bool brief = false) const;
+    void print_py(std::ostream& os) const;
+    std::unordered_map<instruction_ref, std::string>
+    print_py(std::ostream& os,
+             const std::string& mname,
+             std::unordered_map<instruction_ref, std::string> names) const;
     void print_cpp(std::ostream& os) const;
     std::unordered_map<instruction_ref, std::string>
     print_cpp(std::ostream& os,
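A minimal sketch using the new single-argument overload declared above to dump a module as Python source:

// Hedged usage sketch; assumes module is declared in <migraphx/module.hpp>.
#include <iostream>
#include <migraphx/module.hpp>

void dump_py(const migraphx::module& m) { m.print_py(std::cout); }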