Commit 03afa098 authored by Shucai Xiao's avatar Shucai Xiao
Browse files

temp changes.

parent 9e0dca3d
......@@ -36,7 +36,7 @@ struct argument : raw_data<argument>
}
/// Provides a raw pointer to the data
std::function<char*()> data;
std::function<char*()> data = nullptr;
/// Whether data is available
bool empty() const { return not data; }
......
......@@ -70,7 +70,6 @@ add_library(migraphx_gpu
lrn.cpp
schedule_model.cpp
adjust_allocation.cpp
pack_int8_args.cpp
clip.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
......
......@@ -61,6 +61,11 @@ struct hip_tensor_descriptor
{
std::copy(s.lens().begin(), s.lens().end(), lens);
std::copy(s.strides().begin(), s.strides().end(), strides);
indices.resize(s.strides().size());
std::iota(indices.begin(), indices.end(), 0);
std::sort(indices.begin(), indices.end(), [&](size_t i1, size_t i2) {
return strides[i1] > strides[i2];
});
}
__device__ __host__ hip_index<NDim> multi(size_t idx) const
......@@ -69,8 +74,8 @@ struct hip_tensor_descriptor
size_t tidx = idx;
for(size_t is = 0; is < NDim; is++)
{
result[is] = tidx / strides[is];
tidx = tidx % strides[is];
result[indices[is]] = tidx / strides[indices[is]];
tidx = tidx % strides[indices[is]];
}
return result;
}
......@@ -83,6 +88,7 @@ struct hip_tensor_descriptor
}
size_t lens[NDim] = {};
size_t strides[NDim] = {};
std::vector<size_t> indices{};
};
} // namespace device
......
......@@ -13,6 +13,8 @@ struct context;
struct miopen_quant_gemm
{
op::quant_dot op;
mutable argument pack_0{};
mutable argument pack_1{};
template <class Self, class F>
static auto reflect(Self& self, F f)
......@@ -30,17 +32,17 @@ struct miopen_quant_gemm
}
};
// GPU operator that packs int8 GEMM input buffers into the layout rocBLAS
// expects ("gpu::gemm_pack"). NOTE(review): this chunk shows only the
// declaration; compute_shape/compute bodies live in a .cpp elsewhere — confirm.
struct hip_pack
{
// Name under which this operator is registered in the program graph.
std::string name() const { return "gpu::gemm_pack"; }
// Output shape of the packing step; implementation not visible in this chunk.
shape compute_shape(const std::vector<shape>& inputs) const;
// Performs the packing on the context's stream; implementation not visible
// in this chunk.
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
// The result is written into the last argument (the pre-allocated output
// buffer), so the output aliases the final input shape.
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
// struct hip_pack
// {
// std::string name() const { return "gpu::gemm_pack"; }
// shape compute_shape(const std::vector<shape>& inputs) const;
// argument
// compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
// std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
// {
// return shapes.size() - 1;
// }
// };
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
#include <migraphx/gpu/quant_gemm.hpp>
#include <migraphx/gpu/device/pack.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/generate.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -54,15 +55,21 @@ rb_type<T>* to_rocblas_type(T* x)
shape miopen_quant_gemm::compute_shape(const std::vector<shape>& inputs) const
{
std::vector<shape> input_shapes(inputs);
if(!inputs.at(1).transposed())
{
input_shapes.pop_back();
}
if(inputs.at(0).transposed())
{
input_shapes.pop_back();
}
input_shapes.pop_back();
// if(!inputs.at(1).transposed())
// {
// if (pack_1.empty())
// {
// pack_1 = allocate_gpu(inputs.at(1));
// }
// }
// if(inputs.at(0).transposed())
// {
// if (pack_0.empty())
// {
// pack_0 = allocate_gpu(inputs.at(0));
// }
// }
check_shapes{input_shapes}.not_broadcasted();
return op.compute_shape(input_shapes);
......@@ -82,26 +89,37 @@ argument miopen_quant_gemm::compute(context& ctx,
rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
rocblas_int ldc = args[2].get_shape().strides()[dim_0];
size_t addi_ref_num = 0;
if(!transb)
{
++addi_ref_num;
const argument& arg_b = args[args.size() - 1];
// argument for B is the last one in the input argument vector
// use the algorithm to pack A
device::pack_a(ctx.get_stream().get(), args[1], arg_b);
if (pack_1.empty())
{
std::cout << "allocate pack_1" << std::endl;
pack_1 = allocate_gpu(args.at(1).get_shape());
}
//assert(!pack_1.empty());
device::pack_a(ctx.get_stream().get(), pack_1, args[1]);
auto pb = from_gpu(pack_1);
std::cout << "pb = " << pb << std::endl;
}
// need to pack A in this scenario, use the algorithm to pack B in the
// comment of the API
if(transa)
{
++addi_ref_num;
const argument& arg_a = args[args.size() - 1 - addi_ref_num];
device::pack_b(ctx.get_stream().get(), args[0], arg_a);
if (pack_0.empty())
{
std::cout << "allocate pack_0" << std::endl;
pack_0 = allocate_gpu(args.at(0).get_shape());
}
device::pack_b(ctx.get_stream().get(), pack_0, args[0]);
auto a = from_gpu(args[0]);
auto pa = from_gpu(pack_0);
std::cout << "a = " << a << std::endl;
std::cout << "pa = " << pa << std::endl;
}
bool is_3inputs = (args.size() - addi_ref_num == 4);
bool is_3inputs = (args.size() == 4);
int8_t beta = 0;
if(is_3inputs)
{
......@@ -135,10 +153,10 @@ argument miopen_quant_gemm::compute(context& ctx,
m,
k,
&alpha_r,
to_pointer(args[1]),
(!transb) ? to_pointer(pack_1) : to_pointer(args[1]),
rocblas_datatype_i8_r,
ldb,
to_pointer(args[0]),
transa ? to_pointer(pack_0) : to_pointer(args[0]),
rocblas_datatype_i8_r,
lda,
&beta_r,
......@@ -165,11 +183,11 @@ argument miopen_quant_gemm::compute(context& ctx,
m,
k,
&alpha_r,
to_pointer(args[1]),
(!transb) ? to_pointer(pack_1) : to_pointer(args[1]),
rocblas_datatype_i8_r,
ldb,
k * n,
to_pointer(args[0]),
transa ? to_pointer(pack_0) : to_pointer(args[0]),
rocblas_datatype_i8_r,
lda,
m * k,
......
......@@ -71,8 +71,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx) const
eliminate_allocation{"hip::allocate"},
check_context<context>{},
dead_code_elimination{},
pack_int8_args{},
dead_code_elimination{},
// pack_int8_args{},
// dead_code_elimination{},
eliminate_identity{}
};
// clang-format on
......
Markdown is supported
Attach a file by drag & drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment