Merge branch 'gather_operator' into seq2seq_example

98cd353f · Shucai Xiao · 4702c17e · c21a1b28 · 98cd353f · 98cd353f
Commit 98cd353f authored Feb 22, 2019 by Shucai Xiao
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -145,6 +145,7 @@ rocm_create_package(
    DESCRIPTION "AMD's graph optimizer"
    MAINTAINER "Paul Fultz II <paul.fultz@amd.com>"
    LDCONFIG
+    PTH
    DEPENDS miopen-hip rocblas hip_hcc half
 )


--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -107,6 +107,10 @@ rocmtest tidy: rocmnode('rocmtest') { cmake_build ->
    stage('Clang Release') {
        cmake_build("hcc", "-DCMAKE_BUILD_TYPE=release")
    }
+
+    stage('Clang Release Python 3') {
+        cmake_build("hcc", "-DCMAKE_BUILD_TYPE=release -DPYTHON_EXECUTABLE=/usr/local/bin/python3")
+    }
 }, gcc5: rocmnode('rocmtest') { cmake_build ->
    stage('GCC 5 Debug') {
        cmake_build("g++-5", "-DCMAKE_BUILD_TYPE=debug")

--- a/requirements.txt
+++ b/requirements.txt
 google/protobuf -DCMAKE_POSITION_INDEPENDENT_CODE=On
-RadeonOpenCompute/rocm-cmake@ac45c6e --build
+RadeonOpenCompute/rocm-cmake@42f6740 --build
 ROCmSoftwarePlatform/rocBLAS@30a992ae02fda568688bcd190edd5e277d6674d9
 ROCmSoftwarePlatform/MIOpen@1.7.0
 blaze,https://bitbucket.org/blaze-lib/blaze/get/f0755dea0e03.tar.gz -X header -DHEADER_DIR=blaze

--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -757,43 +757,81 @@ struct gather
        // negative axis means counting dimensions from back
        int axis_index = (axis < 0) ? (n_dim + axis) : axis;

-        auto type        = inputs[0].type();
-        lens[axis_index] = inputs[1].elements();
-
-        return {type, lens};
-    }
+        auto type = inputs[0].type();
+        lens.erase(lens.begin() + axis_index);
+        if(!inputs[1].scalar())
+        {
+            auto ind_lens = inputs[1].lens();
+            lens.insert(lens.begin() + axis_index, ind_lens.begin(), ind_lens.end());
+        }

-    template <class T>
-    void compute_index(const T& out_idx,
-                       const int axis_index,
-                       const std::vector<std::size_t>& vec_indices,
-                       const std::size_t max_dim,
-                       T& in_idx) const
-    {
-        in_idx          = out_idx;
-        std::size_t idx = vec_indices.at(out_idx[axis_index]);
-        if(idx >= max_dim)
+        // for scalar output
+        if(lens.size() == 0)
        {
-            MIGRAPHX_THROW("Gather: indices are out of range in input tensor");
+            return {type, {1}, {0}};
        }
-        in_idx[axis_index] = idx;
+
+        return {type, lens};
    }

+    // template <class T>
+    // void compute_index(const T& out_idx,
+    //                    const int axis_index,
+    //                    const std::vector<std::size_t>& vec_indices,
+    //                    const std::size_t max_dim,
+    //                    T& in_idx) const
+    // {
+    //     in_idx          = out_idx;
+    //     std::size_t idx = vec_indices.at(out_idx[axis_index]);
+    //     if(idx >= max_dim)
+    //     {
+    //         MIGRAPHX_THROW("Gather: indices are out of range in input tensor");
+    //     }
+    //     in_idx[axis_index] = idx;
+    // }
+
    argument compute(const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        // negative axis means counting dimensions from back
-        int axis_index = (axis < 0) ? (output_shape.lens().size() + axis) : axis;
+        int axis_index = (axis < 0) ? (args[0].get_shape().lens().size() + axis) : axis;

        // max dimension in axis
-        std::size_t max_dim = args[0].get_shape().lens()[axis_index];
-        std::vector<std::size_t> vec_indices;
-        args[1].visit([&](auto indices) { vec_indices.assign(indices.begin(), indices.end()); });
-        visit_all(result, args[0])([&](auto output, auto input) {
-            std::vector<std::size_t> in_idx;
-            shape_for_each(output.get_shape(), [&](const auto& idx) {
-                this->compute_index(idx, axis_index, vec_indices, max_dim, in_idx);
-                output(idx.begin(), idx.end()) = input(in_idx.begin(), in_idx.end());
+        // std::size_t max_dim = args[0].get_shape().lens()[axis_index];
+        // std::vector<std::size_t> vec_indices;
+        // args[1].visit([&](auto indices) { vec_indices.assign(indices.begin(), indices.end()); });
+        visit_all(result, args[0])([&](auto output, auto data) {
+            args[1].visit([&](auto indices) {
+                if(indices.get_shape().scalar())
+                {
+                    if(output_shape.scalar())
+                    {
+                        output[0] = data[indices.front()];
+                    }
+                    else
+                    {
+                        shape_for_each(output.get_shape(), [&](const auto& out_idx) {
+                            auto data_idx = out_idx;
+                            data_idx.insert(data_idx.begin() + axis_index, indices.front());
+                            output(out_idx.begin(), out_idx.end()) =
+                                data(data_idx.begin(), data_idx.end());
+                        });
+                    }
+                }
+                else
+                {
+                    auto ind_lens = indices.get_shape().lens();
+                    shape_for_each(output.get_shape(), [&](const auto& out_idx) {
+                        auto data_idx = out_idx;
+                        auto start_it = data_idx.begin() + axis_index;
+                        auto end_it   = data_idx.begin() + axis_index + ind_lens.size();
+                        std::vector<std::size_t> ind_idx(start_it, end_it);
+                        data_idx.erase(start_it, end_it);
+                        data_idx.insert(start_it, indices(ind_idx.begin(), ind_idx.end()));
+                        output(out_idx.begin(), out_idx.end()) =
+                            data(data_idx.begin(), data_idx.end());
+                    });
+                }
            });
        });


--- a/src/include/migraphx/tensor_view.hpp
+++ b/src/include/migraphx/tensor_view.hpp
@@ -124,6 +124,8 @@ struct tensor_view
            return m_data + this->size();
    }

+    std::vector<T> to_vector() const { return std::vector<T>(this->begin(), this->end()); }
+
    friend std::ostream& operator<<(std::ostream& os, const tensor_view<T>& x)
    {
        if(!x.empty())

--- a/src/onnx/onnx.cpp
+++ b/src/onnx/onnx.cpp
@@ -434,7 +434,15 @@ struct onnx_parser
                                   attribute_map attributes,
                                   const std::vector<instruction_ref>&)
    {
-        literal v = parse_value(attributes.at("value"));
+        literal v     = parse_value(attributes.at("value"));
+        auto dim_size = attributes.at("value").t().dims_size();
+        // if dim_size is 0, it is a scalar
+        if(dim_size == 0)
+        {
+            migraphx::shape scalar_shape{v.get_shape().type(), {1}, {0}};
+            return prog.add_literal(migraphx::literal{scalar_shape, v.data()});
+        }
+
        return prog.add_literal(v);
    }

@@ -461,6 +469,18 @@ struct onnx_parser
        {
            transb = parse_value(attributes.at("transB")).at<bool>();
        }
+
+        // beginning or end of both args have dimension 1, need to squeeze
+        // before calling gemm, then doing unsqueeze after getting results
+        std::size_t num_squeeze = args[0]->get_shape().lens().size();
+        if(num_squeeze > 2)
+        {
+            std::vector<int64_t> vec_axises(num_squeeze - 2);
+            std::iota(vec_axises.begin(), vec_axises.end(), 0);
+            args[0] = prog.add_instruction(op::squeeze{vec_axises}, args[0]);
+            args[1] = prog.add_instruction(op::squeeze{vec_axises}, args[1]);
+        }
+
        std::vector<int64_t> perm = {1, 0};
        auto l1 = (transa) ? prog.add_instruction(op::transpose{perm}, args[0]) : args[0];
        auto l2 = (transb) ? prog.add_instruction(op::transpose{perm}, args[1]) : args[1];
@@ -469,6 +489,13 @@ struct onnx_parser
            if(beta != 0.f)
            {
                auto l3 = prog.add_instruction(op::dot{alpha}, l1, l2);
+                if(num_squeeze > 2)
+                {
+                    std::vector<int64_t> vec_axises(num_squeeze - 2);
+                    std::iota(vec_axises.begin(), vec_axises.end(), 0);
+                    l3 = prog.add_instruction(op::unsqueeze{vec_axises}, l3);
+                }
+
                auto l4 = args[2];
                if(l4->get_shape().scalar()) // ignore args[2] (no C value added to alpha*A*B)
                    return l3;
@@ -481,7 +508,16 @@ struct onnx_parser
                return add_broadcastable_binary_op(l3, l4, op::add{});
            }
        }
-        return prog.add_instruction(op::dot{alpha, beta}, l1, l2);
+
+        auto dot_res = prog.add_instruction(op::dot{alpha, beta}, l1, l2);
+        if(num_squeeze > 2)
+        {
+            std::vector<int64_t> vec_axises(num_squeeze - 2);
+            std::iota(vec_axises.begin(), vec_axises.end(), 0);
+            dot_res = prog.add_instruction(op::unsqueeze{vec_axises}, dot_res);
+        }
+
+        return dot_res;
    }

    instruction_ref

--- a/src/opt/memory_coloring_impl.cpp
+++ b/src/opt/memory_coloring_impl.cpp
@@ -203,9 +203,8 @@ void memory_coloring_impl::rewrite()

            if(is_allocate(ins))
            {
-                assert(!ins->inputs().empty());
                p_program->replace_instruction(
-                    ins, op::load{ins->inputs().at(0)->get_shape(), offset}, scratch_param);
+                    ins, op::load{ins->get_shape(), offset}, scratch_param);
            }
            else if(is_literal(ins))
            {

--- a/src/py/CMakeLists.txt
+++ b/src/py/CMakeLists.txt

 option(MIGRAPHX_ENABLE_PYTHON "Enable python bindings" ON)
 if(MIGRAPHX_ENABLE_PYTHON)
-find_program(DEFAULT_PYTHON_EXE python)
-if(DEFAULT_PYTHON_EXE)
-set(PYTHON_EXECUTABLE ${DEFAULT_PYTHON_EXE} CACHE PATH "Path to python executable")
-endif()
-find_package(pybind11 REQUIRED)
-pybind11_add_module(migraphx_py migraphx_py.cpp)
-set_target_properties(migraphx_py PROPERTIES 
-    OUTPUT_NAME migraphx
-    C_VISIBILITY_PRESET hidden
-    CXX_VISIBILITY_PRESET hidden
-)
-target_link_libraries(migraphx_py PRIVATE migraphx migraphx_onnx migraphx_cpu)
-if(MIGRAPHX_ENABLE_GPU)
-target_link_libraries(migraphx_py PRIVATE migraphx_gpu)
-target_compile_definitions(migraphx_py PRIVATE -DHAVE_GPU)
-endif()
+    find_program(DEFAULT_PYTHON_EXE python)
+    if(DEFAULT_PYTHON_EXE)
+        set(PYTHON_EXECUTABLE ${DEFAULT_PYTHON_EXE} CACHE PATH "Path to python executable")
+    endif()
+    find_package(pybind11 REQUIRED)
+    pybind11_add_module(migraphx_py migraphx_py.cpp)
+    set_target_properties(migraphx_py PROPERTIES 
+        OUTPUT_NAME migraphx
+        C_VISIBILITY_PRESET hidden
+        CXX_VISIBILITY_PRESET hidden
+    )
+    target_link_libraries(migraphx_py PRIVATE migraphx migraphx_onnx migraphx_cpu)
+    if(MIGRAPHX_ENABLE_GPU)
+        target_link_libraries(migraphx_py PRIVATE migraphx_gpu)
+        target_compile_definitions(migraphx_py PRIVATE -DHAVE_GPU)
+    endif()
+    rocm_install_targets(TARGETS migraphx_py)
 endif()
--- a/src/py/migraphx_py.cpp
+++ b/src/py/migraphx_py.cpp
@@ -28,6 +28,11 @@ struct throw_half
    {
        throw std::runtime_error("Half not supported in python yet.");
    }
+
+    void operator()(migraphx::tensor_view<migraphx::half>) const
+    {
+        throw std::runtime_error("Half not supported in python yet.");
+    }
 };

 template <class F>
@@ -42,6 +47,8 @@ struct skip_half
    }

    void operator()(migraphx::shape::as<migraphx::half>) const {}
+
+    void operator()(migraphx::tensor_view<migraphx::half>) const {}
 };

 template <class F>
@@ -50,6 +57,12 @@ void visit_type(const migraphx::shape& s, F f)
    s.visit_type(throw_half<F>{f});
 }

+template <class T, class F>
+void visit(const migraphx::raw_data<T>& x, F f)
+{
+    x.visit(throw_half<F>{f});
+}
+
 template <class F>
 void visit_types(F f)
 {
@@ -60,6 +73,9 @@ template <class T>
 py::buffer_info to_buffer_info(T& x)
 {
    migraphx::shape s = x.get_shape();
+    auto strides      = s.strides();
+    std::transform(
+        strides.begin(), strides.end(), strides.begin(), [&](auto i) { return i * s.type_size(); });
    py::buffer_info b;
    visit_type(s, [&](auto as) {
        b = py::buffer_info(x.data(),
@@ -67,7 +83,7 @@ py::buffer_info to_buffer_info(T& x)
                            py::format_descriptor<decltype(as())>::format(),
                            s.lens().size(),
                            s.lens(),
-                            s.strides());
+                            strides);
    });
    return b;
 }
@@ -75,11 +91,20 @@ py::buffer_info to_buffer_info(T& x)
 migraphx::shape to_shape(const py::buffer_info& info)
 {
    migraphx::shape::type_t t;
+    std::size_t n = 0;
    visit_types([&](auto as) {
        if(info.format == py::format_descriptor<decltype(as())>::format())
+        {
            t = as.type_enum();
+            n = sizeof(as());
+        }
+
    });
-    return migraphx::shape{t, info.shape, info.strides};
+    auto strides = info.strides;
+    std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto i) -> std::size_t {
+        return n > 0 ? i / n : 0;
+    });
+    return migraphx::shape{t, info.shape, strides};
 }

 PYBIND11_MODULE(migraphx, m)
@@ -108,6 +133,13 @@ PYBIND11_MODULE(migraphx, m)
                 py::buffer_info info = b.request();
                 new(&x) migraphx::argument(to_shape(info), info.ptr);
             })
+        .def("get_shape", &migraphx::argument::get_shape)
+        .def("tolist",
+             [](migraphx::argument& x) {
+                 py::list l{x.get_shape().elements()};
+                 visit(x, [&](auto data) { l = py::cast(data.to_vector()); });
+                 return l;
+             })
        .def("__eq__", std::equal_to<migraphx::argument>{})
        .def("__ne__", std::not_equal_to<migraphx::argument>{})
        .def("__repr__", [](const migraphx::argument& x) { return migraphx::to_string(x); });

--- a/src/targets/gpu/device/gather.cpp
+++ b/src/targets/gpu/device/gather.cpp
@@ -16,22 +16,35 @@ argument gather(hipStream_t stream,
                std::vector<migraphx::argument> args,
                int axis)
 {
-    int axis_index = (axis < 0) ? (axis + output_shape.lens().size()) : axis;
+    int axis_index = (axis < 0) ? (axis + args[0].get_shape().lens().size()) : axis;
    visit_all(args.back(), args[0])([&](auto output, auto input) {
        std::size_t nelements = output_shape.elements();
        args[1].visit([&](auto indices) {
-            visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
-                const auto* indices_ptr = device_cast(indices.data());
-                auto* outptr            = device_cast(output.data());
-                const auto* inptr       = device_cast(input.data());
-                hip_tensor_descriptor<ndim> desc_input(input.get_shape());
-                hip_tensor_descriptor<ndim> desc_output(output.get_shape());
-                gs_launch(stream, nelements)([=](auto i) {
-                    auto lens        = desc_output.multi(i);
-                    lens[axis_index] = indices_ptr[lens[axis_index]];
-                    outptr[i]        = inptr[desc_input.linear(lens)];
+            const auto* indices_ptr = device_cast(indices.data());
+            auto* out_ptr           = device_cast(output.data());
+            const auto* in_ptr      = device_cast(input.data());
+            if(output_shape.scalar())
+            {
+                gs_launch(stream, 1)(
+                    [=](auto i) { out_ptr[i] = in_ptr[static_cast<int>(indices_ptr[0])]; });
+            }
+            else
+            {
+                // if indices are a scalar, output has one dim smaller than input
+                auto& input_shape = args[0].get_shape();
+                auto lens         = input_shape.lens();
+                lens[axis_index]  = args[1].get_shape().elements();
+                migraphx::shape out_comp_shape{output_shape.type(), lens};
+                visit_tensor_size(out_comp_shape.lens().size(), [&](auto n_out_dim) {
+                    hip_tensor_descriptor<n_out_dim> desc_input(input_shape);
+                    hip_tensor_descriptor<n_out_dim> desc_output(out_comp_shape);
+                    gs_launch(stream, nelements)([=](auto ii) {
+                        auto in_idx        = desc_output.multi(ii);
+                        in_idx[axis_index] = indices_ptr[in_idx[axis_index]];
+                        out_ptr[ii]        = in_ptr[desc_input.linear(in_idx)];
+                    });
                });
-            });
+            }
        });
    });


--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -11,7 +11,7 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_NULL_STREAM)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NULL_STREAM)

 struct hip_device
 {
@@ -40,7 +40,7 @@ struct hip_device

        hipStream_t get()
        {
-            if(enabled(MIGRAPHX_DISABLE_NULL_STREAM{}))
+            if(not enabled(MIGRAPHX_ENABLE_NULL_STREAM{}))
            {
                setup();
                if(s == nullptr)
@@ -53,7 +53,7 @@ struct hip_device

        auto create_miopen_handle()
        {
-            if(enabled(MIGRAPHX_DISABLE_NULL_STREAM{}))
+            if(not enabled(MIGRAPHX_ENABLE_NULL_STREAM{}))
                return make_obj<miopen_handle>(&miopenCreateWithStream, get());
            else
                return make_obj<miopen_handle>(&miopenCreate);

--- a/src/targets/gpu/include/migraphx/gpu/hip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/hip.hpp
@@ -23,12 +23,13 @@ void copy_to_gpu(const argument& src, const argument& dst);

 struct hip_allocate
 {
+    shape s;
    std::string tag{};
    std::string name() const { return "hip::allocate"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
-        check_shapes{inputs}.has(1);
-        return inputs.front();
+        check_shapes{inputs}.has(0);
+        return s;
    }
    argument compute(context&, const shape& output_shape, const std::vector<argument>&) const
    {

--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -127,8 +127,7 @@ struct miopen_apply
        }
        else
        {
-            auto is     = prog->add_outline(s);
-            auto result = prog->insert_instruction(ins, hip_allocate{std::move(tag)}, is);
+            auto result = prog->insert_instruction(ins, hip_allocate{s, std::move(tag)});
            return result;
        }
    }

--- a/src/targets/gpu/write_literals.cpp
+++ b/src/targets/gpu/write_literals.cpp
@@ -37,8 +37,7 @@ void write_literals::apply(program& p) const
            {
                literal l  = ins->get_literal();
                auto pre   = p.add_literal(l);
-                auto s     = p.add_outline(l.get_shape());
-                auto alloc = p.insert_instruction(std::next(pre), hip_allocate{}, s);
+                auto alloc = p.insert_instruction(std::next(pre), hip_allocate{l.get_shape()});
                p.replace_instruction(ins, hip_copy{}, pre, alloc);
            }
            else

--- a/test/memory_coloring_test.cpp
+++ b/test/memory_coloring_test.cpp
@@ -21,8 +21,8 @@ struct allocate
    std::string name() const { return "allocate"; }
    migraphx::shape compute_shape(const std::vector<migraphx::shape>& inputs) const
    {
-        migraphx::check_shapes{inputs, *this}.has(1);
-        return inputs.front();
+        migraphx::check_shapes{inputs, *this}.has(0);
+        return s;
    }
    migraphx::argument compute(migraphx::context&,
                               const migraphx::shape& output_shape,
@@ -34,8 +34,7 @@ struct allocate

 migraphx::instruction_ref add_alloc(migraphx::program& p, const migraphx::shape& s)
 {
-    auto a0 = p.add_outline(s);
-    return p.add_instruction(allocate{}, a0);
+    return p.add_instruction(allocate{s});
 }

 bool no_allocate(const migraphx::program& p)

--- a/test/op_shape_test.cpp
+++ b/test/op_shape_test.cpp
@@ -235,7 +235,7 @@ TEST_CASE(gather)
        migraphx::shape input{migraphx::shape::float_type, {2, 3, 4, 5}};
        migraphx::shape indices{migraphx::shape::int32_type, {2, 3}};
        int axis = 1;
-        expect_shape(migraphx::shape{migraphx::shape::float_type, {2, 6, 4, 5}},
+        expect_shape(migraphx::shape{migraphx::shape::float_type, {2, 2, 3, 4, 5}},
                     migraphx::op::gather{axis},
                     input,
                     indices);
@@ -245,7 +245,7 @@ TEST_CASE(gather)
        migraphx::shape input{migraphx::shape::float_type, {2, 3, 4, 5}};
        migraphx::shape indices{migraphx::shape::int32_type, {2, 3}};
        int axis = -4;
-        expect_shape(migraphx::shape{migraphx::shape::float_type, {6, 3, 4, 5}},
+        expect_shape(migraphx::shape{migraphx::shape::float_type, {2, 3, 3, 4, 5}},
                     migraphx::op::gather{axis},
                     input,
                     indices);

--- a/test/py/CMakeLists.txt
+++ b/test/py/CMakeLists.txt
@@ -4,6 +4,8 @@ find_package(PythonInterp)
 function(add_py_test NAME SCRIPT)
    set (ENV_COMMAND ${CMAKE_COMMAND} -E env 
        "PYTHONPATH=$<TARGET_FILE_DIR:migraphx_py>"
+        "PYTHONMALLOC=debug"
+        "MALLOC_CHECK_=3"
    )
    add_test(
        NAME test_py_${NAME} 
@@ -15,7 +17,8 @@ endfunction()
 add_dependencies(tests migraphx_py)
 add_dependencies(check migraphx_py)

-add_py_test(cpu cpu.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
+add_py_test(cpu test_cpu.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
 if(MIGRAPHX_ENABLE_GPU)
-add_py_test(gpu gpu.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
+add_py_test(gpu test_gpu.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
+add_py_test(array test_array.py WORKING_DIRECTORY ${TEST_ONNX_DIR})
 endif()
--- a/test/py/test_array.py
+++ b/test/py/test_array.py
+import migraphx, struct, array, sys
+try:
+    from functools import reduce
+except:
+    pass
+
+def assert_eq(x, y):
+    if x == y:
+        pass
+    else:
+        raise Exception(str(x) + " != " + str(y))
+
+def read_float(b, index):
+    return struct.unpack_from('f', b, index*4)[0]
+
+def write_float(b, index):
+    struct.pack_into('f', b, index*4)
+
+def nelements(lens):
+    return reduce(lambda x,y: x*y,lens, 1)
+
+def create_buffer(t, data, shape):
+    a = array.array(t, data)
+    if sys.version_info >= (3, 0):
+        m = memoryview(a.tobytes())
+        return m.cast(t, shape)
+    else:
+        m = memoryview(a.tostring())
+        return m
+
+def check_argument(a):
+    l = a.tolist()
+    for i in range(len(l)):
+        assert_eq(l[i], read_float(a, i))
+
+def check_shapes(r, m):
+    lens = list(m.shape)
+    strides = [int(s/m.itemsize) for s in m.strides]
+    elements = nelements(lens)
+    assert_eq(r.get_shape().elements(), elements)
+    assert_eq(r.get_shape().lens(), lens)
+    assert_eq(r.get_shape().strides(), strides)
+
+def run(p):
+    params = {}
+    for key, value in p.get_parameter_shapes().items():
+        params[key] = migraphx.to_gpu(migraphx.generate_argument(value))
+
+    return migraphx.from_gpu(p.run(params))
+
+def test_shape(shape):
+    data = list(range(nelements(shape)))
+    m = create_buffer('f', data, shape)
+    a = migraphx.argument(m)
+    check_shapes(a, m)
+    assert_eq(a.tolist(), data)
+
+def test_input():
+    if sys.version_info >= (3, 0):
+        test_shape([4])
+        test_shape([2, 3])
+    else:
+        data = list(range(4))
+        m = create_buffer('f', data, [4])
+        a1 = migraphx.argument(m)
+        a2 = migraphx.argument(bytearray(a1))
+        check_shapes(a2, m)
+        assert_eq(a1.tolist(), m.tolist())
+
+
+def test_output():
+    p = migraphx.parse_onnx("conv_relu_maxpool.onnx")
+    p.compile(migraphx.get_target("gpu"))
+
+    r1 = run(p)
+    r2 = run(p)
+    assert_eq(r1, r2)
+    assert_eq(r1.tolist(), r2.tolist())
+
+    check_argument(r1)
+    check_argument(r2)
+
+    m1 = memoryview(r1)
+    m2 = memoryview(r2)
+
+    check_shapes(r1, m1)
+    check_shapes(r2, m2)
+
+
+test_input()
+test_output()
--- a/test/py/cpu.py
+++ b/test/py/cpu.py
--- a/test/py/gpu.py
+++ b/test/py/gpu.py