Commit 1b4216ca authored by Shucai Xiao's avatar Shucai Xiao
Browse files

Merge branch 'op_capture' into int8_quantize

parents 0e7e27cc eab3cafb
...@@ -32,7 +32,9 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") ...@@ -32,7 +32,9 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
endif() endif()
endif() endif()
if(CMAKE_CXX_COMPILER MATCHES ".*hcc") include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("--cuda-host-only -x hip" HAS_HIP)
if(HAS_HIP)
message(STATUS "Enable miopen backend") message(STATUS "Enable miopen backend")
set(MIGRAPHX_ENABLE_GPU On CACHE BOOL "") set(MIGRAPHX_ENABLE_GPU On CACHE BOOL "")
else() else()
......
...@@ -82,8 +82,6 @@ std::vector<T> generate_tensor_data(const migraphx::shape& s, unsigned long seed ...@@ -82,8 +82,6 @@ std::vector<T> generate_tensor_data(const migraphx::shape& s, unsigned long seed
{ {
std::vector<T> result(s.elements()); std::vector<T> result(s.elements());
std::generate(result.begin(), result.end(), xorshf96_generator<T>{seed}); std::generate(result.begin(), result.end(), xorshf96_generator<T>{seed});
// divide a value to avoid integer overflow
std::transform(result.begin(), result.end(), result.begin(), [](auto i) { return i / 32; });
// std::generate(result.begin(), result.end(), [&]{ return seed % 7; }); // std::generate(result.begin(), result.end(), [&]{ return seed % 7; });
// std::generate(result.begin(), result.end(), []{ return 1; }); // std::generate(result.begin(), result.end(), []{ return 1; });
return result; return result;
......
...@@ -20,14 +20,11 @@ namespace op { ...@@ -20,14 +20,11 @@ namespace op {
struct convert : unary<convert> struct convert : unary<convert>
{ {
shape::type_t target_type = shape::half_type; shape::type_t target_type = shape::half_type;
float scale = 1.0f;
float shift = 0.0f;
template <class Self, class F> template <class Self, class F>
static auto reflect(Self& self, F f) static auto reflect(Self& self, F f)
{ {
return pack( return pack(f(self.target_type, "target_type"));
f(self.target_type, "target_type"), f(self.scale, "scale"), f(self.shift, "shift"));
} }
shape compute_shape(std::vector<shape> inputs) const shape compute_shape(std::vector<shape> inputs) const
...@@ -38,22 +35,10 @@ struct convert : unary<convert> ...@@ -38,22 +35,10 @@ struct convert : unary<convert>
auto apply() const auto apply() const
{ {
return [&](auto x) { return [](auto x) { return x; };
float res = scale * x + shift;
if(target_type == shape::int8_type)
{
int factor = (res >= 0.0f) ? 1 : -1;
res = res + factor * 0.5f;
res = res > 127.0f ? 127.0f : res;
res = res < -128.0f ? -128.0f : res;
}
return res;
};
} }
convert(shape::type_t t) : target_type{t} {} convert(shape::type_t t) : target_type{t} {}
convert(shape::type_t t, float sle, float sft) : target_type{t}, scale{sle}, shift{sft} {}
convert() {} convert() {}
}; };
......
...@@ -19,7 +19,7 @@ void quantize(program& prog); ...@@ -19,7 +19,7 @@ void quantize(program& prog);
// to int8 // to int8
void capture_arguments(program& prog, void capture_arguments(program& prog,
const std::vector<std::string>& ins_names, const std::vector<std::string>& ins_names,
std::function<void(std::size_t, std::vector<argument>)> func); const std::function<void(std::size_t, std::vector<argument>)>& func);
void capture_arguments(program& prog, const std::vector<std::string>& ins_names); void capture_arguments(program& prog, const std::vector<std::string>& ins_names);
void capture_arguments(program& prog); void capture_arguments(program& prog);
......
...@@ -23,9 +23,7 @@ inline namespace MIGRAPHX_INLINE_NS { ...@@ -23,9 +23,7 @@ inline namespace MIGRAPHX_INLINE_NS {
instruction_ref insert_quant_ins(program& prog, instruction_ref insert_quant_ins(program& prog,
instruction_ref& ins, instruction_ref& ins,
shape::type_t type, shape::type_t type,
std::unordered_map<instruction_ref, instruction_ref>& map_ins, std::unordered_map<instruction_ref, instruction_ref>& map_ins)
float scale = 1.0f,
float shift = 0.0f)
{ {
if(map_ins.count(ins) > 0) if(map_ins.count(ins) > 0)
{ {
...@@ -37,16 +35,11 @@ instruction_ref insert_quant_ins(program& prog, ...@@ -37,16 +35,11 @@ instruction_ref insert_quant_ins(program& prog,
return ins; return ins;
} }
if(scale < 0.0f)
{
MIGRAPHX_THROW("INSERT_QUANT_INS: scale less than 0");
}
assert(ins->get_shape().type() == shape::float_type || assert(ins->get_shape().type() == shape::float_type ||
ins->get_shape().type() == shape::double_type || ins->get_shape().type() == shape::double_type ||
ins->get_shape().type() == shape::int32_type); ins->get_shape().type() == shape::int32_type);
instruction_ref quant_ins{}; instruction_ref quant_ins{};
quant_ins = prog.insert_instruction(std::next(ins), op::convert{type, scale, shift}, ins); quant_ins = prog.insert_instruction(std::next(ins), op::convert{type}, ins);
map_ins[ins] = quant_ins; map_ins[ins] = quant_ins;
return quant_ins; return quant_ins;
...@@ -337,7 +330,7 @@ void quantize_int8(program& prog) ...@@ -337,7 +330,7 @@ void quantize_int8(program& prog)
// capture operator to compute the scale and shift // capture operator to compute the scale and shift
void capture_arguments(program& prog, void capture_arguments(program& prog,
const std::vector<std::string>& ins_names, const std::vector<std::string>& ins_names,
std::function<void(std::size_t, std::vector<argument>)> func) const std::function<void(std::size_t, std::vector<argument>)>& func)
{ {
size_t num_quant_params = 0; size_t num_quant_params = 0;
......
...@@ -37,7 +37,7 @@ add_library(migraphx_device ...@@ -37,7 +37,7 @@ add_library(migraphx_device
device/pad.cpp device/pad.cpp
device/gather.cpp device/gather.cpp
device/sub.cpp device/sub.cpp
device/pack.cpp device/int8_gemm_pack.cpp
device/div.cpp device/div.cpp
device/clip.cpp device/clip.cpp
device/reduce_sum.cpp device/reduce_sum.cpp
...@@ -85,9 +85,12 @@ add_library(migraphx_gpu ...@@ -85,9 +85,12 @@ add_library(migraphx_gpu
lrn.cpp lrn.cpp
schedule_model.cpp schedule_model.cpp
adjust_allocation.cpp adjust_allocation.cpp
pack_int8_args.cpp
clip.cpp clip.cpp
reduce_sum.cpp reduce_sum.cpp
reduce_mean.cpp reduce_mean.cpp
int8_gemm_pack.cpp
int8_conv_pack.cpp
) )
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu) set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu) rocm_clang_tidy_check(migraphx_gpu)
......
...@@ -15,7 +15,7 @@ shape hip_convert::compute_shape(std::vector<shape> inputs) const ...@@ -15,7 +15,7 @@ shape hip_convert::compute_shape(std::vector<shape> inputs) const
argument hip_convert::compute(context& ctx, const shape&, const std::vector<argument>& args) const argument hip_convert::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{ {
device::convert(ctx.get_stream().get(), args[1], args[0], op.scale, op.shift, op.target_type); device::convert(ctx.get_stream().get(), args[1], args[0]);
return args[1]; return args[1];
} }
......
...@@ -6,31 +6,14 @@ inline namespace MIGRAPHX_INLINE_NS { ...@@ -6,31 +6,14 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
namespace device { namespace device {
void convert(hipStream_t stream, void convert(hipStream_t stream, const argument& result, const argument& arg)
const argument& result,
const argument& arg,
float scale,
float shift,
shape::type_t target_type)
{ {
result.visit([&](auto output) { result.visit([&](auto output) {
arg.visit([&](auto input) { arg.visit([&](auto input) {
const auto* input_ptr = device_cast(input.data()); const auto* input_ptr = device_cast(input.data());
auto* output_ptr = device_cast(output.data()); auto* output_ptr = device_cast(output.data());
if(target_type == shape::int8_type) gs_launch(stream,
{ result.get_shape().elements())([=](auto i) { output_ptr[i] = input_ptr[i]; });
gs_launch(stream, result.get_shape().elements())([=](auto i) {
float res = input_ptr[i] * scale + shift;
int factor = (res >= 0.0f) ? 1 : -1;
output_ptr[i] = static_cast<int8_t>(
std::min<float>(std::max<float>(-128.0f, res + factor * 0.5), 127.0f));
});
}
else
{
gs_launch(stream, result.get_shape().elements())(
[=](auto i) { output_ptr[i] = input_ptr[i] * scale + shift; });
}
}); });
}); });
} }
......
#include <migraphx/shape.hpp> #include <migraphx/shape.hpp>
#include <migraphx/argument.hpp> #include <migraphx/argument.hpp>
#include <migraphx/gpu/device/pack.hpp> #include <migraphx/gpu/device/int8_gemm_pack.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp> #include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp> #include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/hip.hpp> #include <migraphx/gpu/hip.hpp>
namespace migraphx { namespace migraphx {
...@@ -11,7 +11,7 @@ inline namespace MIGRAPHX_INLINE_NS { ...@@ -11,7 +11,7 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
namespace device { namespace device {
void pack_a(hipStream_t stream, const argument& result, const argument& arg) void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg)
{ {
auto comp_shape = arg.get_shape(); auto comp_shape = arg.get_shape();
auto out_lens = comp_shape.lens(); auto out_lens = comp_shape.lens();
...@@ -38,7 +38,7 @@ void pack_a(hipStream_t stream, const argument& result, const argument& arg) ...@@ -38,7 +38,7 @@ void pack_a(hipStream_t stream, const argument& result, const argument& arg)
}); });
} }
void pack_b(hipStream_t stream, const argument& result, const argument& arg) void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg)
{ {
auto trans_shape = arg.get_shape(); auto trans_shape = arg.get_shape();
auto out_lens = trans_shape.lens(); auto out_lens = trans_shape.lens();
......
...@@ -11,12 +11,7 @@ inline namespace MIGRAPHX_INLINE_NS { ...@@ -11,12 +11,7 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
namespace device { namespace device {
void convert(hipStream_t stream, void convert(hipStream_t stream, const argument& result, const argument& arg);
const argument& result,
const argument& arg,
float scale,
float shift,
shape::type_t target_type);
} // namespace device } // namespace device
} // namespace gpu } // namespace gpu
......
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_PACK_HPP #ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_PACK_HPP #define MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
#include <migraphx/argument.hpp> #include <migraphx/argument.hpp>
#include <migraphx/config.hpp> #include <migraphx/config.hpp>
...@@ -10,11 +10,9 @@ inline namespace MIGRAPHX_INLINE_NS { ...@@ -10,11 +10,9 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
namespace device { namespace device {
void pack_a(hipStream_t stream, const argument& result, const argument& arg); void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg);
void pack_b(hipStream_t stream, const argument& result, const argument& arg); void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg);
void sync_stream(hipStream_t stream);
} // namespace device } // namespace device
} // namespace gpu } // namespace gpu
......
#ifndef MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP
#define MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/config.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
// GPU operator that repacks an int8 tensor into the vec4 layout required by
// MIOpen's int8 convolution (implemented via miopenTransformTensor in the
// corresponding .cpp file).
struct miopen_int8_conv_pack
{
    std::string name() const { return "gpu::int8_conv_pack"; }
    // Validates the input shapes; the packed result has the same shape as the input.
    shape compute_shape(const std::vector<shape>& inputs) const;
    // Performs the packing; writes into the last (pre-allocated) argument.
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
    // The output aliases the last input argument (the destination buffer).
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_INT8_GEMM_PACK_HPP
#define MIGRAPHX_GUARD_RTGLIB_INT8_GEMM_PACK_HPP
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/config.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
// Operator wrapping device::int8_gemm_pack_a: repacks one operand of an
// int8 GEMM into the layout the rocBLAS int8 gemm expects.
struct hip_int8_gemm_pack_a
{
    std::string name() const { return "gpu::int8_gemm_pack_a"; }
    // Validates the input shapes; the packed result has the same shape as the input.
    shape compute_shape(const std::vector<shape>& inputs) const;
    // Performs the packing; writes into the last (pre-allocated) argument.
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
    // The output aliases the last input argument (the destination buffer).
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
// Operator wrapping device::int8_gemm_pack_b: the counterpart packing for the
// other int8 GEMM operand.
struct hip_int8_gemm_pack_b
{
    std::string name() const { return "gpu::int8_gemm_pack_b"; }
    // Validates the input shapes; the packed result has the same shape as the input.
    shape compute_shape(const std::vector<shape>& inputs) const;
    // Performs the packing; writes into the last (pre-allocated) argument.
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
    // The output aliases the last input argument (the destination buffer).
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_PACK_INT8_ARGS_HPP
#define MIGRAPHX_GUARD_RTGLIB_PACK_INT8_ARGS_HPP
#include <migraphx/program.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Compiler pass that inserts packing instructions in front of quantized
// GEMM ("gpu::quant_gemm") and convolution ("gpu::quant_convolution")
// instructions so their int8 arguments are laid out as the rocBLAS/MIOpen
// kernels require.
struct pack_int8_args
{
    std::string name() const { return "gpu::pack_int8_args"; }
    // Walks the program and inserts allocation + packing instructions,
    // rewiring the quantized ops to consume the packed buffers.
    void apply(program& p) const;
    // Returns `s` with its second dimension padded up to a multiple of 4
    // (vec4 packing); throws unless `s` has int8 type.
    shape pack_int8_shape(const shape& s) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
...@@ -17,8 +17,6 @@ struct miopen_quant_convolution ...@@ -17,8 +17,6 @@ struct miopen_quant_convolution
shared<convolution_descriptor> cd; shared<convolution_descriptor> cd;
miopenConvFwdAlgorithm_t algo{}; miopenConvFwdAlgorithm_t algo{};
miopenHandle_t handle = nullptr; miopenHandle_t handle = nullptr;
argument arg_vec4_x{};
argument arg_vec4_w{};
template <class Self, class F> template <class Self, class F>
static auto reflect(Self& self, F f) static auto reflect(Self& self, F f)
...@@ -39,7 +37,7 @@ struct miopen_quant_convolution ...@@ -39,7 +37,7 @@ struct miopen_quant_convolution
} }
private: private:
shape pack_int8_shape(shape& s); shape pack_int8_shape(const shape& s) const;
}; };
} // namespace gpu } // namespace gpu
......
...@@ -10,7 +10,7 @@ namespace gpu { ...@@ -10,7 +10,7 @@ namespace gpu {
struct context; struct context;
struct miopen_quant_gemm struct rocblas_quant_gemm
{ {
op::quant_dot op; op::quant_dot op;
...@@ -24,6 +24,7 @@ struct miopen_quant_gemm ...@@ -24,6 +24,7 @@ struct miopen_quant_gemm
shape compute_shape(const std::vector<shape>& inputs) const; shape compute_shape(const std::vector<shape>& inputs) const;
argument argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const; compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
void batch_not_transposed(const std::vector<std::size_t>& strides) const;
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{ {
return shapes.size() - 1; return shapes.size() - 1;
......
#include <migraphx/gpu/int8_conv_pack.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Validate that exactly one standard-layout input was given; the packed
// output buffer has the same shape as the input.
shape miopen_int8_conv_pack::compute_shape(const std::vector<shape>& inputs) const
{
    const auto& input = inputs.at(0);
    check_shapes{{input}, *this}.has(1).standard();
    return input;
}
// Repack args[0] into the vec4 layout MIOpen int8 convolutions require,
// writing the result into args[1] (the pre-allocated destination buffer).
argument
miopen_int8_conv_pack::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    // Descriptors for the source layout and the vec4-packed destination layout.
    auto src_desc  = make_tensor(args[0].get_shape());
    auto vec4_desc = make_tensor(args[0].get_shape(), true);
    const float alpha = 1;
    const float beta  = 0;
    // Let MIOpen perform the layout transformation on the stream's handle.
    const auto status = miopenTransformTensor(ctx.get_stream().get_miopen(),
                                              &alpha,
                                              src_desc.get(),
                                              args[0].implicit(),
                                              &beta,
                                              vec4_desc.get(),
                                              args[1].implicit());
    if(status != miopenStatusSuccess)
    {
        MIGRAPHX_THROW("INT8_CONV_PACK: transform input tensor failed");
    }
    return args[1];
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/int8_gemm_pack.hpp>
#include <migraphx/gpu/device/int8_gemm_pack.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Validate a single packed, non-broadcast input; the packed output buffer
// has the same shape as the input.
shape hip_int8_gemm_pack_a::compute_shape(const std::vector<shape>& inputs) const
{
    const auto& input = inputs.at(0);
    check_shapes{{input}, *this}.has(1).not_broadcasted().packed();
    return input;
}
// Pack args[0] into args[1] (the pre-allocated output buffer) on the
// context's stream via the device kernel.
argument
hip_int8_gemm_pack_a::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    auto stream = ctx.get_stream().get();
    device::int8_gemm_pack_a(stream, args[1], args[0]);
    return args[1];
}
// Validate a single packed, non-broadcast input; the packed output buffer
// has the same shape as the input.
shape hip_int8_gemm_pack_b::compute_shape(const std::vector<shape>& inputs) const
{
    const auto& input = inputs.at(0);
    check_shapes{{input}, *this}.has(1).not_broadcasted().packed();
    return input;
}
// Pack args[0] into args[1] (the pre-allocated output buffer) on the
// context's stream via the device kernel.
argument
hip_int8_gemm_pack_b::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    auto stream = ctx.get_stream().get();
    device::int8_gemm_pack_b(stream, args[1], args[0]);
    return args[1];
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...@@ -59,6 +59,7 @@ ...@@ -59,6 +59,7 @@
#include <migraphx/gpu/reduce_mean.hpp> #include <migraphx/gpu/reduce_mean.hpp>
#include <migraphx/gpu/pow.hpp> #include <migraphx/gpu/pow.hpp>
#include <migraphx/gpu/sqdiff.hpp> #include <migraphx/gpu/sqdiff.hpp>
#include <migraphx/gpu/int8_conv_pack.hpp>
#include <utility> #include <utility>
#include <functional> #include <functional>
#include <algorithm> #include <algorithm>
...@@ -115,6 +116,7 @@ struct miopen_apply ...@@ -115,6 +116,7 @@ struct miopen_apply
add_generic_op<hip_sqdiff>("sqdiff"); add_generic_op<hip_sqdiff>("sqdiff");
add_extend_op<miopen_gemm, op::dot>("dot"); add_extend_op<miopen_gemm, op::dot>("dot");
add_extend_op<rocblas_quant_gemm, op::quant_dot>("quant_dot");
add_extend_op<miopen_contiguous, op::contiguous>("contiguous"); add_extend_op<miopen_contiguous, op::contiguous>("contiguous");
add_extend_op<hip_concat, op::concat>("concat"); add_extend_op<hip_concat, op::concat>("concat");
add_extend_op<hip_softmax, op::softmax>("softmax"); add_extend_op<hip_softmax, op::softmax>("softmax");
...@@ -131,7 +133,7 @@ struct miopen_apply ...@@ -131,7 +133,7 @@ struct miopen_apply
add_lrn_op(); add_lrn_op();
add_convolution_op(); add_convolution_op();
add_quant_convolution_op(); add_quant_convolution_op();
add_quant_dot_op(); // add_quant_dot_op();
add_pooling_op(); add_pooling_op();
add_batch_norm_inference_op(); add_batch_norm_inference_op();
} }
...@@ -182,32 +184,14 @@ struct miopen_apply ...@@ -182,32 +184,14 @@ struct miopen_apply
{ {
apply_map.emplace("quant_convolution", [=](instruction_ref ins) { apply_map.emplace("quant_convolution", [=](instruction_ref ins) {
auto&& op = any_cast<op::quant_convolution>(ins->get_operator()); auto&& op = any_cast<op::quant_convolution>(ins->get_operator());
auto conv = miopen_quant_convolution{op, make_conv(op)}; auto conv = miopen_quant_convolution{op, make_conv(op)};
auto ws = conv.compile(ctx, ins->get_shape(), to_shapes(ins->inputs())); auto ws = conv.compile(ctx, ins->get_shape(), to_shapes(ins->inputs()));
auto args = ins->inputs();
auto workspace = insert_allocation(ins, ws, "workspace"); auto workspace = insert_allocation(ins, ws, "workspace");
auto output = insert_allocation(ins, ins->get_shape()); auto output = insert_allocation(ins, ins->get_shape());
return prog->replace_instruction( return prog->replace_instruction(ins, conv, args[0], args[1], workspace, output);
ins, conv, ins->inputs().at(0), ins->inputs().at(1), workspace, output);
});
}
void add_quant_dot_op()
{
apply_map.emplace("quant_dot", [=](instruction_ref ins) {
auto&& op = any_cast<op::quant_dot>(ins->get_operator());
auto inputs = ins->inputs();
auto in_shapes = to_shapes(inputs);
auto pack_a = insert_allocation(ins, in_shapes[0], "pack_a");
auto pack_b = insert_allocation(ins, in_shapes[1], "pack_b");
auto output = insert_allocation(ins, ins->get_shape());
inputs.push_back(pack_a);
inputs.push_back(pack_b);
inputs.push_back(output);
return prog->replace_instruction(ins, miopen_quant_gemm{op}, inputs);
}); });
} }
......
#include <migraphx/gpu/pack_int8_args.hpp>
#include <migraphx/gpu/int8_gemm_pack.hpp>
#include <migraphx/gpu/int8_conv_pack.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/program.hpp>
#include <migraphx/iterator_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Rewrites quantized GEMM/convolution instructions so their int8 inputs are
// first repacked into the layouts the rocBLAS/MIOpen backends require. New
// allocation + pack instructions are inserted immediately before each
// affected instruction, and the instruction's argument is redirected to the
// packed buffer.
void pack_int8_args::apply(program& p) const
{
    for(auto ins : iterator_for(p))
    {
        if(ins->name() == "gpu::quant_gemm")
        {
            auto inputs = ins->inputs();
            bool transa = inputs[0]->get_shape().transposed();
            bool transb = inputs[1]->get_shape().transposed();
            // NOTE(review): pack_a is applied to input B (when not transposed)
            // and pack_b to input A (when transposed). This looks swapped, but
            // presumably reflects the row-major/column-major operand swap done
            // when calling rocBLAS — confirm against the quant_gemm backend.
            if(!transb)
            {
                auto packed_b = p.insert_instruction(ins, hip_allocate{inputs[1]->get_shape()});
                auto output_b =
                    p.insert_instruction(ins, hip_int8_gemm_pack_a{}, {inputs[1], packed_b});
                instruction::replace_argument(ins, inputs[1], output_b);
            }
            if(transa)
            {
                auto packed_a = p.insert_instruction(ins, hip_allocate{inputs[0]->get_shape()});
                auto output_a =
                    p.insert_instruction(ins, hip_int8_gemm_pack_b{}, {inputs[0], packed_a});
                instruction::replace_argument(ins, inputs[0], output_a);
            }
        }
        else if(ins->name() == "gpu::quant_convolution")
        {
            auto inputs = ins->inputs();
            // Pack both the data (x) and weight (w) tensors into the
            // channel-padded vec4 layout (see pack_int8_shape).
            auto packed_x =
                p.insert_instruction(ins, hip_allocate{pack_int8_shape(inputs[0]->get_shape())});
            auto output_x =
                p.insert_instruction(ins, miopen_int8_conv_pack{}, {inputs[0], packed_x});
            instruction::replace_argument(ins, inputs[0], output_x);
            auto packed_w =
                p.insert_instruction(ins, hip_allocate{pack_int8_shape(inputs[1]->get_shape())});
            auto output_w =
                p.insert_instruction(ins, miopen_int8_conv_pack{}, {inputs[1], packed_w});
            instruction::replace_argument(ins, inputs[1], output_w);
        }
    }
}
// Produce the padded shape used for vec4 int8 packing: the second dimension
// is rounded up to the next multiple of 4 and the outermost stride widened
// to cover the padded extent. Throws for non-int8 shapes.
shape pack_int8_args::pack_int8_shape(const shape& s) const
{
    if(s.type() != shape::int8_type)
    {
        MIGRAPHX_THROW("PACK_INT8_ARGS: only process int8_type");
    }
    auto padded_lens    = s.lens();
    auto padded_strides = s.strides();
    // Round dimension 1 up to a multiple of 4 for vec4 packing.
    padded_lens[1]    = (padded_lens[1] + 3) / 4 * 4;
    // The outer stride must now step over the padded channel extent.
    padded_strides[0] = padded_strides[1] * padded_lens[1];
    return {s.type(), padded_lens, padded_strides};
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment