adjust tests

00b0396b · Khalique · 78bcbe2c · 0628e570 · 00b0396b · 00b0396b
Commit 00b0396b authored Aug 19, 2019 by Khalique
20 changed files
--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -4,7 +4,9 @@
 #include <migraphx/dfor.hpp>
 #include <migraphx/op/batch_norm.hpp>
 #include <migraphx/op/convolution.hpp>
+#include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/dot.hpp>
+#include <migraphx/op/quant_dot.hpp>
 #include <migraphx/op/elu.hpp>
 #include <migraphx/op/im2col.hpp>
 #include <migraphx/op/leaky_relu.hpp>
@@ -216,6 +218,61 @@ struct cpu_convolution
    }
 };

+/// CPU reference implementation of the quant_convolution operator.
+/// Input and weight tensors are visited with their stored element type and the
+/// products are accumulated into an int32_t output tensor.
+struct cpu_quant_convolution
+{
+    op::quant_convolution op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "cpu::quant_convolution"; }
+
+    /// Shape inference is delegated to the wrapped operator.
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        return op.compute_shape(inputs);
+    }
+
+    /// Direct (nested-loop) convolution of args[0] (input) with args[1] (weights).
+    /// Returns a freshly allocated tensor of `output_shape` holding int32_t sums.
+    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        auto output = result.get<int32_t>();
+        visit_all(args[0], args[1])([&](auto input, auto weights) {
+            const auto in_lens = input.get_shape().lens();
+            const auto in_h    = in_lens[2];
+            const auto in_w    = in_lens[3];
+
+            const auto wei_lens = weights.get_shape().lens();
+            const auto wei_n    = wei_lens[0];
+            const auto wei_c    = wei_lens[1];
+            const auto wei_h    = wei_lens[2];
+            const auto wei_w    = wei_lens[3];
+
+            const auto& out_lens = output_shape.lens();
+            // one parallel work item per output element
+            par_dfor(out_lens[0], out_lens[1], out_lens[2], out_lens[3])(
+                [&](std::size_t n, std::size_t oc, std::size_t oh, std::size_t ow) {
+                    const auto start_x  = oh * op.stride[0] - op.padding[0];
+                    const auto start_y  = ow * op.stride[1] - op.padding[1];
+                    const auto group_id = oc / (wei_n / op.group);
+
+                    int32_t sum = 0;
+                    dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
+                        const auto in_x = start_x + x;
+                        const auto in_y = start_y + y;
+                        // Skip filter taps that land outside the input image.
+                        // NOTE(review): if stride/padding are unsigned, the `< 0` tests
+                        // never fire and such taps are rejected by the upper-bound
+                        // tests after wrap-around - confirm against the op definition.
+                        if(in_x < 0 || in_x >= in_h || in_y < 0 || in_y >= in_w)
+                            return;
+                        const auto in_ch = group_id * wei_c + k;
+                        sum += static_cast<int32_t>(input(n, in_ch, in_x, in_y)) *
+                               weights(oc, k, x, y);
+                    });
+                    output(n, oc, oh, ow) = sum;
+                });
+        });
+
+        return result;
+    }
+};
+
 struct cpu_im2col
 {
    op::im2col op;
@@ -433,7 +490,7 @@ struct cpu_gemm
    {
        argument result{output_shape};
        // 3 inputs, it is alpha * A * B + beta * C, then
-        // A and B are matrics, and C is broadcastable to A * B
+        // A and B are matrices, and C is of the same shape as A * B
        if(args.size() == 3)
        {
            // no need to consider the value of args[2]
@@ -460,6 +517,72 @@ struct cpu_gemm
    }
 };

+/// CPU reference implementation of quant_dot: alpha * A * B, optionally plus beta * C.
+struct cpu_quant_gemm
+{
+    op::quant_dot op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "cpu::quant_dot"; }
+
+    /// Delegates to the wrapped operator after checking that an optional third
+    /// input (C) is not a broadcasted shape.
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        if(inputs.size() == 3)
+            check_shapes{{inputs.at(2)}}.not_broadcasted();
+        return op.compute_shape(inputs);
+    }
+
+    /// Computes the product into a new tensor of `output_shape`.
+    /// With three inputs, C has the same shape as A * B and seeds the result.
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+
+        // migemm is fed int32 operands, so each input is first copied element-wise
+        // into a new int32 tensor with the same lens.
+        auto widen = [](argument& src) {
+            argument dst{{shape::int32_type, {src.get_shape().lens()}}};
+            dst.visit([&](auto output) {
+                src.visit(
+                    [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
+            });
+            return dst;
+        };
+        argument arg_0 = widen(args.at(0));
+        argument arg_1 = widen(args.at(1));
+
+        if(args.size() != 3)
+        {
+            // only A and B were supplied
+            migemm(result, arg_0, arg_1, op.alpha, int32_t{0});
+            return result;
+        }
+
+        if(op.beta == 0)
+        {
+            // the value of args[2] is irrelevant when beta is zero
+            result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
+        }
+        else
+        {
+            // seed the result with C so migemm can scale it by beta
+            visit_all(result, args[2])([&](auto output, auto input) {
+                std::copy(input.begin(), input.end(), output.begin());
+            });
+        }
+
+        migemm(result, arg_0, arg_1, op.alpha, op.beta);
+        return result;
+    }
+};
+
 struct leaky_relu_op
 {
    op::leaky_relu op;
@@ -671,15 +794,17 @@ struct cpu_apply
    {
        apply_map["batch_norm_inference"] =
            extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
-        apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
-        apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
-        apply_map["elu"]         = extend_op<cpu_unary<elu_op>, op::elu>();
-        apply_map["im2col"]      = extend_op<cpu_im2col, op::im2col>();
-        apply_map["leaky_relu"]  = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
-        apply_map["logsoftmax"]  = extend_op<cpu_logsoftmax, op::logsoftmax>();
-        apply_map["lrn"]         = extend_op<cpu_lrn, op::lrn>();
-        apply_map["pad"]         = extend_op<cpu_pad, op::pad>();
-        apply_map["softmax"]     = extend_op<cpu_softmax, op::softmax>();
+        apply_map["convolution"]       = extend_op<cpu_convolution, op::convolution>();
+        apply_map["dot"]               = extend_op<cpu_gemm, op::dot>();
+        apply_map["quant_dot"]         = extend_op<cpu_quant_gemm, op::quant_dot>();
+        apply_map["quant_convolution"] = extend_op<cpu_quant_convolution, op::quant_convolution>();
+        apply_map["elu"]               = extend_op<cpu_unary<elu_op>, op::elu>();
+        apply_map["im2col"]            = extend_op<cpu_im2col, op::im2col>();
+        apply_map["leaky_relu"]        = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
+        apply_map["logsoftmax"]        = extend_op<cpu_logsoftmax, op::logsoftmax>();
+        apply_map["lrn"]               = extend_op<cpu_lrn, op::lrn>();
+        apply_map["pad"]               = extend_op<cpu_pad, op::pad>();
+        apply_map["softmax"]           = extend_op<cpu_softmax, op::softmax>();
    }

    void apply()

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -39,6 +39,7 @@ add_library(migraphx_device
    device/pad.cpp
    device/gather.cpp
    device/sub.cpp
+    device/int8_gemm_pack.cpp
    device/div.cpp
    device/clip.cpp
    device/reduce_sum.cpp
@@ -64,8 +65,10 @@ add_library(migraphx_gpu
    target.cpp
    lowering.cpp
    gemm.cpp
+    quant_gemm.cpp
    pooling.cpp
    convolution.cpp
+    quant_convolution.cpp
    softmax.cpp
    logsoftmax.cpp
    contiguous.cpp
@@ -79,12 +82,16 @@ add_library(migraphx_gpu
    elu.cpp
    pad.cpp
    gather.cpp
+    convert.cpp
    lrn.cpp
    schedule_model.cpp
    adjust_allocation.cpp
+    pack_int8_args.cpp
    clip.cpp
    reduce_sum.cpp
    reduce_mean.cpp
+    int8_gemm_pack.cpp
+    int8_conv_pack.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 rocm_clang_tidy_check(migraphx_gpu)

--- a/src/targets/gpu/convert.cpp
+++ b/src/targets/gpu/convert.cpp
+#include <migraphx/gpu/convert.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/device/convert.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// Computes the output shape of the GPU convert operator.
+// The last entry of `inputs` is removed before validation
+// (NOTE(review): presumably the preallocated output buffer - confirm), the
+// remaining shapes must be packed, and the result comes from the wrapped op.
+shape hip_convert::compute_shape(std::vector<shape> inputs) const
+{
+    inputs.pop_back();
+    check_shapes{inputs}.packed();
+    return op.compute_shape(inputs);
+}
+
+// Runs device::convert on the context's stream with args[1] as the destination
+// and args[0] as the source, then returns args[1].
+argument hip_convert::compute(context& ctx, const shape&, const std::vector<argument>& args) const
+{
+    device::convert(ctx.get_stream().get(), args[1], args[0]);
+    return args[1];
+}
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/include/migraphx/gpu/device/array.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/array.hpp
@@ -16,6 +16,12 @@ struct hip_array
    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }

+    // Access to the first stored element, d[0] (mutable and const overloads).
+    MIGRAPHX_DEVICE_CONSTEXPR T& front() { return d[0]; }
+    MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return d[0]; }
+
+    // Access to the last stored element, d[N - 1] (mutable and const overloads).
+    // NOTE(review): assumes N > 0 - confirm no zero-length hip_array is instantiated.
+    MIGRAPHX_DEVICE_CONSTEXPR T& back() { return d[N - 1]; }
+    MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return d[N - 1]; }
+
    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }


--- a/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
@@ -209,28 +209,15 @@ constexpr std::size_t compute_block_size(std::size_t n, std::size_t max_block_si
 }

 template <class Op, class T, class Input, class Output>
-void reduce(hipStream_t stream,
-            const argument& result,
-            const argument& arg,
-            Op op,
-            T init,
-            Input read_input,
-            Output read_output)
+void reduce_multi_impl(hipStream_t stream,
+                       const argument& result,
+                       const argument& arg,
+                       Op op,
+                       T init,
+                       Input read_input,
+                       Output read_output,
+                       const shape& reduce_slice)
 {
-    auto&& output_shape = result.get_shape();
-    auto&& input_shape  = arg.get_shape();
-    std::vector<std::size_t> reduce_lens;
-    std::transform(output_shape.lens().begin(),
-                   output_shape.lens().end(),
-                   input_shape.lens().begin(),
-                   std::back_inserter(reduce_lens),
-                   [](auto x, auto y) -> std::size_t {
-                       if(x == y)
-                           return 1;
-                       else
-                           return y;
-                   });
-    shape reduce_slice{output_shape.type(), reduce_lens};
    hip_visit_all(result, arg, reduce_slice)([&](auto output, auto input, auto reduce_shape) {
        auto nelements = result.get_shape().elements();
        auto relements = reduce_slice.elements();
@@ -250,6 +237,83 @@ void reduce(hipStream_t stream,
    });
 }

+// Launches a reduction in which every output element is computed from
+// `relements` consecutive input values.
+//   stream      - HIP stream handed to gs_launch
+//   result/arg  - output and input buffers, addressed through their raw data() pointers
+//   op, init    - forwarded unchanged to block_reduce
+//                 (NOTE(review): presumably the combining operation and its start value)
+//   read_input  - applied to each input value before it is reduced
+//   read_output - applied to the reduced value before it is stored
+//   relements   - number of input values folded into one output element
+//   stride      - distance, in elements, between the first inputs of consecutive outputs
+template <class Op, class T, class Input, class Output>
+void reduce_standard_impl(hipStream_t stream,
+                          const argument& result,
+                          const argument& arg,
+                          Op op,
+                          T init,
+                          Input read_input,
+                          Output read_output,
+                          std::size_t relements,
+                          std::size_t stride)
+{
+    hip_visit_all(result, arg)([&](auto output, auto input) {
+        auto nelements = result.get_shape().elements();
+
+        const std::size_t max_block_size = 256;
+        const std::size_t block_size     = compute_block_size(relements, max_block_size);
+        // block_size work items are launched per output element
+        gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
+            // every group of block_size consecutive global ids shares one output element
+            const auto out_idx  = i / block_size;
+            const auto base_idx = out_idx * stride;
+            auto r = block_reduce<max_block_size>(idx, op, init, relements, [&](auto j) __device__ {
+                return read_input(input.data()[base_idx + j]);
+            });
+            // only the work item with local id 0 stores the reduced value
+            if(idx.local == 0)
+                output.data()[out_idx] = read_output(r);
+        });
+    });
+}
+
+// Reduces `arg` into `result` on `stream`.
+//   op, init    - reduction operation and its initial value, forwarded to the kernels
+//   read_input  - applied to each input value before it is reduced
+//   read_output - applied to each reduced value before it is stored
+//
+// Two strategies are used:
+//  * fast path: both shapes are standard, all leading dimensions agree and only the
+//    innermost dimension differs; each output element then owns one contiguous row
+//    of the input and reduce_standard_impl is used;
+//  * general path: a "reduce slice" shape is built that has length 1 on every axis
+//    where input and output agree and the input length elsewhere, and
+//    reduce_multi_impl is used.
+template <class Op, class T, class Input, class Output>
+void reduce(hipStream_t stream,
+            const argument& result,
+            const argument& arg,
+            Op op,
+            T init,
+            Input read_input,
+            Output read_output)
+{
+    auto&& output_shape = result.get_shape();
+    auto&& input_shape  = arg.get_shape();
+    if(input_shape.standard() and output_shape.standard() and
+       output_shape.lens().back() != input_shape.lens().back() and
+       std::equal(output_shape.lens().begin(),
+                  std::prev(output_shape.lens().end()),
+                  input_shape.lens().begin()))
+    {
+        // In a standard layout the rows of the innermost axis are stored back to
+        // back, so the row belonging to output element k starts at k * row_length.
+        // The row length must be used as the stride: the product of all strides
+        // only coincides with it for rank <= 2 and would index past the end of the
+        // input for higher ranks.
+        // NOTE(review): this assumes the output keeps the reduced axis with length 1.
+        const std::size_t relements = input_shape.lens().back();
+        reduce_standard_impl(
+            stream, result, arg, op, init, read_input, read_output, relements, relements);
+    }
+    else
+    {
+        std::vector<std::size_t> reduce_lens;
+        std::transform(output_shape.lens().begin(),
+                       output_shape.lens().end(),
+                       input_shape.lens().begin(),
+                       std::back_inserter(reduce_lens),
+                       [](auto x, auto y) -> std::size_t {
+                           // axes that are not reduced contribute a single element
+                           if(x == y)
+                               return 1;
+                           else
+                               return y;
+                       });
+        shape reduce_slice{output_shape.type(), reduce_lens};
+        reduce_multi_impl(stream, result, arg, op, init, read_input, read_output, reduce_slice);
+    }
+}
+
 } // namespace device
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp
@@ -31,6 +31,7 @@ struct hip_tensor_descriptor
            result[is] = tidx / strides[is];
            tidx       = tidx % strides[is];
        }
+
        return result;
    }
    __device__ __host__ std::size_t linear(hip_tensor_index<NDim> s) const

--- a/src/targets/gpu/device/int8_gemm_pack.cpp
+++ b/src/targets/gpu/device/int8_gemm_pack.cpp
+#include <migraphx/shape.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/gpu/device/int8_gemm_pack.hpp>
+#include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/types.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
+#include <migraphx/gpu/hip.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+// Copies `arg` into `result` with the elements of every matrix regrouped in
+// chunks of nb = 4 along the second-to-last dimension.
+// NOTE(review): presumably the packed int8 layout expected by the int8 GEMM
+// backend - confirm against the rocBLAS documentation.
+//   stream - HIP stream the kernel is launched on
+//   result - destination buffer
+//   arg    - source tensor; its shape drives all index computations
+void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg)
+{
+    auto comp_shape    = arg.get_shape();
+    auto out_lens      = comp_shape.lens();
+    auto dim_0         = out_lens.size() - 2;
+    auto dim_1         = out_lens.size() - 1;
+    // leading dimension: stride of the second-to-last axis
+    std::size_t lda    = comp_shape.strides()[dim_0];
+    // number of elements in one matrix (the two innermost dimensions)
+    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
+    visit_all(result, arg)([&](auto output, auto input) {
+        std::size_t nelements = comp_shape.elements();
+        auto* out_ptr         = device_cast(output.data());
+        auto* in_ptr          = device_cast(input.data());
+        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
+            hip_tensor_descriptor<out_dim> desc(comp_shape);
+            // one work item per element
+            gs_launch(stream, nelements, 256)([=](auto ii) {
+                const size_t nb    = 4;
+                auto idx           = desc.multi(ii);
+                std::size_t i_m    = idx[dim_1];
+                std::size_t i_k    = idx[dim_0];
+                // start of the matrix that element ii belongs to
+                std::size_t offset = ii / m_size * m_size;
+                // source (i_m, i_k) goes to slot i_k % nb of chunk i_k / nb
+                out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
+                    in_ptr[i_m + i_k * lda + offset];
+            });
+        });
+    });
+}
+
+// Counterpart of int8_gemm_pack_a for the other operand: copies `arg` into
+// `result`, regrouping elements in chunks of nb = 4. Indices are derived from a
+// shape whose two innermost lens are swapped relative to `arg`, and the leading
+// dimension is the stride of the last axis.
+// NOTE(review): presumably `arg` is expected to be a transposed view - confirm
+// with the caller.
+//   stream - HIP stream the kernel is launched on
+//   result - destination buffer
+//   arg    - source tensor
+void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg)
+{
+    auto trans_shape = arg.get_shape();
+    auto out_lens    = trans_shape.lens();
+    auto dim_0       = trans_shape.lens().size() - 2;
+    auto dim_1       = trans_shape.lens().size() - 1;
+    // leading dimension: stride of the last axis of the incoming shape
+    std::size_t ldb  = trans_shape.strides()[dim_1];
+
+    // index space used by the kernel: same type, two innermost lens swapped
+    auto wrap_lens = out_lens;
+    std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
+    shape comp_shape{trans_shape.type(), wrap_lens};
+    // number of elements in one matrix (the two innermost dimensions)
+    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
+    visit_all(result, arg)([&](auto output, auto input) {
+        std::size_t nelements = comp_shape.elements();
+        auto* out_ptr         = device_cast(output.data());
+        auto* in_ptr          = device_cast(input.data());
+        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
+            hip_tensor_descriptor<out_dim> desc(comp_shape);
+            // one work item per element
+            gs_launch(stream, nelements, 256)([=](auto ii) {
+                const size_t nb    = 4;
+                auto idx           = desc.multi(ii);
+                std::size_t i_n    = idx[dim_1];
+                std::size_t i_k    = idx[dim_0];
+                // start of the matrix that element ii belongs to
+                std::size_t offset = ii / m_size * m_size;
+                // source (i_n, i_k) goes to slot i_k % nb of chunk i_k / nb
+                out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
+                    in_ptr[i_n + i_k * ldb + offset];
+            });
+        });
+    });
+}
+
+// Calls hipStreamSynchronize on `stream`; the returned status code is discarded.
+// NOTE(review): not declared in device/int8_gemm_pack.hpp as shown in this commit -
+// confirm whether it is used elsewhere.
+void sync_stream(hipStream_t stream) { hipStreamSynchronize(stream); }
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -134,8 +134,6 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
    auto conv = any_cast<miopen_convolution>(ins->get_operator());
    if(conv.op.group > 1)
        return false;
-    if(conv.op.padding_mode != op::padding_mode_t::default_)
-        return false;
    if(wei.lens()[1] > 512 and conv.algo != miopenConvolutionFwdAlgoWinograd)
        return false;
    auto op = conv.op;

--- a/src/targets/gpu/gemm.cpp
+++ b/src/targets/gpu/gemm.cpp
@@ -233,6 +233,10 @@ argument miopen_gemm::compute(context& ctx,
        auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
        if(num_matrices == 1)
        {
+            // the rocblas_gemm API handles inputs and output matrices as
+            // column-major format. When doing a C = A * B, we actually do
+            // C^T = (B^T) * (A^T). That is the reason we input args[1] as
+            // A and args[0] as B in calling the rocblas_gemm.
            generic_rocblas_gemm(as,
                                 ctx.get_stream().get_rocblas(),
                                 transb ? rocblas_operation_transpose : rocblas_operation_none,

--- a/src/targets/gpu/include/migraphx/gpu/convert.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convert.hpp
@@ -3,8 +3,6 @@

 #include <migraphx/shape.hpp>
 #include <migraphx/op/convert.hpp>
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/convert.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -12,7 +10,7 @@ namespace gpu {

 struct context;

-struct hip_convert : unary_device<hip_convert, device::convert>
+struct hip_convert
 {
    op::convert op;

@@ -22,13 +20,15 @@ struct hip_convert : unary_device<hip_convert, device::convert>
        return migraphx::reflect(self.op, f);
    }

-    hip_convert(op::convert oper) : op(oper) {}
+    std::string name() const { return "gpu::convert"; }

-    shape compute_shape(std::vector<shape> inputs) const
+    shape compute_shape(std::vector<shape> inputs) const;
+
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
+
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
-        inputs.pop_back();
-        check_shapes{inputs}.packed();
-        return op.compute_shape(inputs);
+        return shapes.size() - 1;
    }
 };


--- a/src/targets/gpu/include/migraphx/gpu/device/int8_gemm_pack.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/int8_gemm_pack.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_INT8_GEMM_PACK_HPP
+
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+// Writes a copy of `arg`, regrouped in chunks of 4 elements, into `result` using
+// `stream` (defined in device/int8_gemm_pack.cpp).
+void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg);
+
+// Same as int8_gemm_pack_a, but indexes `arg` with its two innermost dimensions
+// swapped (defined in device/int8_gemm_pack.cpp).
+void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg);
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/int8_conv_pack.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/int8_conv_pack.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP
+#define MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP
+
+#include <migraphx/op/quant_dot.hpp>
+#include <migraphx/config.hpp>
+#include <utility>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct context;
+
+// GPU operator "gpu::int8_conv_pack". compute_shape() and compute() are defined
+// in int8_conv_pack.cpp.
+struct miopen_int8_conv_pack
+{
+    std::string name() const { return "gpu::int8_conv_pack"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
+    // Index of the last input shape.
+    // NOTE(review): presumably marks the final argument as the buffer the result
+    // aliases - confirm against the operation interface.
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/int8_gemm_pack.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/int8_gemm_pack.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_INT8_GEMM_PACK_HPP
+#define MIGRAPHX_GUARD_RTGLIB_INT8_GEMM_PACK_HPP
+
+#include <migraphx/op/quant_dot.hpp>
+#include <migraphx/config.hpp>
+#include <utility>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct context;
+
+// GPU operator "gpu::int8_gemm_pack_a". compute_shape() and compute() are
+// defined in int8_gemm_pack.cpp.
+struct hip_int8_gemm_pack_a
+{
+    std::string name() const { return "gpu::int8_gemm_pack_a"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
+    // Index of the last input shape.
+    // NOTE(review): presumably marks the final argument as the buffer the result
+    // aliases - confirm against the operation interface.
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+};
+
+// GPU operator "gpu::int8_gemm_pack_b". compute_shape() and compute() are
+// defined in int8_gemm_pack.cpp.
+struct hip_int8_gemm_pack_b
+{
+    std::string name() const { return "gpu::int8_gemm_pack_b"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
+    // Index of the last input shape.
+    // NOTE(review): presumably marks the final argument as the buffer the result
+    // aliases - confirm against the operation interface.
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/miopen.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/miopen.hpp
@@ -34,11 +34,11 @@ Result make_obj(F f, Ts... xs)
    auto status                = f(&x, xs...);
    Result r{x};
    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen call failed");
+        MIGRAPHX_THROW("MAKE_OBJ: MIOpen call failed");
    return r;
 }

-inline tensor_descriptor make_tensor(const migraphx::shape& s)
+inline tensor_descriptor make_tensor(const migraphx::shape& s, bool pack = false)
 {
    auto t = make_obj<tensor_descriptor>(&miopenCreateTensorDescriptor);
    // Convert to ints
@@ -49,13 +49,33 @@ inline tensor_descriptor make_tensor(const migraphx::shape& s)
        d = miopenFloat;
    else if(s.type() == shape::half_type)
        d = miopenHalf;
+    else if(s.type() == shape::int32_type)
+        d = miopenInt32;
+    else if(s.type() == shape::int8_type)
+    {
+        if(pack)
+        {
+            // update the lens and corresponding strides
+            d          = miopenInt8x4;
+            lens[1]    = ((lens[1] + 3) / 4) * 4;
+            strides[0] = strides[1] * lens[1];
+        }
+        else
+        {
+            d = miopenInt8;
+        }
+    }
    else
-        MIGRAPHX_THROW("Unsupported type");
+    {
+        MIGRAPHX_THROW("MAKE_TENSOR: unsupported type");
+    }
    miopenSetTensorDescriptor(t.get(), d, s.lens().size(), lens.data(), strides.data());
+
    return t;
 }

-inline convolution_descriptor make_conv(const migraphx::op::convolution& op)
+template <class T>
+inline convolution_descriptor make_conv(const T& op)
 {
    auto c = make_obj<convolution_descriptor>(&miopenCreateConvolutionDescriptor);
    miopenConvolutionMode_t c_mode = miopenConvolution;

--- a/src/targets/gpu/include/migraphx/gpu/pack_int8_args.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/pack_int8_args.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_PACK_INT8_ARGS_HPP
+#define MIGRAPHX_GUARD_RTGLIB_PACK_INT8_ARGS_HPP
+
+#include <migraphx/program.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/context.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+namespace gpu {
+
+// Program pass named "gpu::pack_int8_args"; apply() rewrites the given program.
+// NOTE(review): judging by the name it inserts packing operators for int8
+// arguments - the implementation is not part of this header.
+struct pack_int8_args
+{
+    std::string name() const { return "gpu::pack_int8_args"; }
+    void apply(program& p) const;
+    // Maps a shape to another shape; the mapping itself is defined out of line.
+    shape pack_int8_shape(const shape& s) const;
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
+#define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
+
+#include <migraphx/shape.hpp>
+#include <migraphx/op/quant_convolution.hpp>
+#include <migraphx/gpu/miopen.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct context;
+
+// GPU operator "gpu::quant_convolution". It bundles the generic
+// quant_convolution operator with a convolution descriptor, a forward
+// algorithm and a MIOpen handle.
+struct miopen_quant_convolution
+{
+    op::quant_convolution op;
+    shared<convolution_descriptor> cd;
+    // value-initialised here; NOTE(review): presumably chosen in compile()/finalize()
+    miopenConvFwdAlgorithm_t algo{};
+    miopenHandle_t handle = nullptr;
+
+    // Only the fields of the wrapped operator are reflected; `algo` is not yet.
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        // TODO: Add algo
+        return op::quant_convolution::reflect(self.op, f);
+    }
+
+    std::string name() const { return "gpu::quant_convolution"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    // The returned shape is used by the lowering pass to allocate the "workspace" buffer.
+    shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    // Index of the last input shape.
+    // NOTE(review): presumably marks the final argument as the buffer the result
+    // aliases - confirm against the operation interface.
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+
+    private:
+    shape pack_int8_shape(const shape& s) const;
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_QUANT_GEMM_HPP
+#define MIGRAPHX_GUARD_RTGLIB_QUANT_GEMM_HPP
+
+#include <migraphx/shape.hpp>
+#include <migraphx/op/quant_dot.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct context;
+
+// GPU operator "gpu::quant_gemm" wrapping the generic quant_dot operator.
+// All non-inline members are defined out of line.
+struct rocblas_quant_gemm
+{
+    op::quant_dot op;
+
+    // Reflection is forwarded to the wrapped operator's fields.
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "gpu::quant_gemm"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    // NOTE(review): presumably validates that the batch dimensions described by
+    // `strides` are not transposed - confirm in the implementation.
+    void batch_not_transposed(const std::vector<std::size_t>& strides) const;
+    // Index of the last input shape.
+    // NOTE(review): presumably marks the final argument as the buffer the result
+    // aliases - confirm against the operation interface.
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/int8_conv_pack.cpp
+++ b/src/targets/gpu/int8_conv_pack.cpp
+#include <migraphx/gpu/int8_conv_pack.hpp>
+#include <migraphx/gpu/context.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// Validates only the first input shape (it must be standard) and returns it
+// unchanged as the output shape; any further inputs are not inspected.
+shape miopen_int8_conv_pack::compute_shape(const std::vector<shape>& inputs) const
+{
+    check_shapes{{inputs.at(0)}, *this}.has(1).standard();
+    return inputs.at(0);
+}
+
+// Transforms args[0] into args[1] with miopenTransformTensor, describing the
+// destination with a tensor descriptor created in "pack" mode, and returns
+// args[1]. Throws if MIOpen reports a failure.
+argument
+miopen_int8_conv_pack::compute(context& ctx, const shape&, const std::vector<argument>& args) const
+{
+    const auto& src_shape = args[0].get_shape();
+    // plain descriptor for the source, packed (vec4) descriptor for the destination
+    auto src_desc  = make_tensor(src_shape);
+    auto vec4_desc = make_tensor(src_shape, true);
+
+    // scaling factors handed to MIOpen: destination = 1 * source + 0 * destination
+    float alpha = 1;
+    float beta  = 0;
+    const auto status = miopenTransformTensor(ctx.get_stream().get_miopen(),
+                                              &alpha,
+                                              src_desc.get(),
+                                              args[0].implicit(),
+                                              &beta,
+                                              vec4_desc.get(),
+                                              args[1].implicit());
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("INT8_CONV_PACK: transform input tensor failed");
+
+    return args[1];
+}
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/int8_gemm_pack.cpp
+++ b/src/targets/gpu/int8_gemm_pack.cpp
+#include <migraphx/gpu/int8_gemm_pack.hpp>
+#include <migraphx/gpu/device/int8_gemm_pack.hpp>
+#include <migraphx/gpu/context.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// Validates only the first input shape (not broadcasted, packed) and returns
+// it unchanged as the output shape.
+shape hip_int8_gemm_pack_a::compute_shape(const std::vector<shape>& inputs) const
+{
+    check_shapes{{inputs.at(0)}, *this}.has(1).not_broadcasted().packed();
+    return inputs.at(0);
+}
+
+// Runs device::int8_gemm_pack_a on the context's stream with args[1] as the
+// destination and args[0] as the source, then returns args[1].
+argument
+hip_int8_gemm_pack_a::compute(context& ctx, const shape&, const std::vector<argument>& args) const
+{
+    device::int8_gemm_pack_a(ctx.get_stream().get(), args[1], args[0]);
+    return args[1];
+}
+
+// Validates only the first input shape (not broadcasted, packed) and returns
+// it unchanged as the output shape.
+shape hip_int8_gemm_pack_b::compute_shape(const std::vector<shape>& inputs) const
+{
+    check_shapes{{inputs.at(0)}, *this}.has(1).not_broadcasted().packed();
+    return inputs.at(0);
+}
+
+// Runs device::int8_gemm_pack_b on the context's stream with args[1] as the
+// destination and args[0] as the source, then returns args[1].
+argument
+hip_int8_gemm_pack_b::compute(context& ctx, const shape&, const std::vector<argument>& args) const
+{
+    device::int8_gemm_pack_b(ctx.get_stream().get(), args[1], args[0]);
+    return args[1];
+}
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -16,6 +16,7 @@
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/convolution.hpp>
+#include <migraphx/gpu/quant_convolution.hpp>
 #include <migraphx/gpu/contiguous.hpp>
 #include <migraphx/gpu/relu.hpp>
 #include <migraphx/gpu/sigmoid.hpp>
@@ -46,6 +47,7 @@
 #include <migraphx/gpu/batchnorm.hpp>
 #include <migraphx/gpu/pooling.hpp>
 #include <migraphx/gpu/gemm.hpp>
+#include <migraphx/gpu/quant_gemm.hpp>
 #include <migraphx/gpu/concat.hpp>
 #include <migraphx/gpu/pad.hpp>
 #include <migraphx/gpu/gather.hpp>
@@ -58,6 +60,7 @@
 #include <migraphx/gpu/reduce_mean.hpp>
 #include <migraphx/gpu/pow.hpp>
 #include <migraphx/gpu/sqdiff.hpp>
+#include <migraphx/gpu/int8_conv_pack.hpp>
 #include <utility>
 #include <functional>
 #include <algorithm>
@@ -115,6 +118,7 @@ struct miopen_apply
        add_generic_op<hip_sign>("sign");

        add_extend_op<miopen_gemm, op::dot>("dot");
+        add_extend_op<rocblas_quant_gemm, op::quant_dot>("quant_dot");
        add_extend_op<miopen_contiguous, op::contiguous>("contiguous");
        add_extend_op<hip_concat, op::concat>("concat");
        add_extend_op<hip_softmax, op::softmax>("softmax");
@@ -130,6 +134,8 @@ struct miopen_apply

        add_lrn_op();
        add_convolution_op();
+        add_quant_convolution_op();
+        // add_quant_dot_op();
        add_pooling_op();
        add_batch_norm_inference_op();
    }
@@ -176,6 +182,21 @@ struct miopen_apply
        });
    }

+    // Registers the lowering rule for "quant_convolution": the instruction is
+    // replaced by a miopen_quant_convolution that takes the two original inputs
+    // followed by a workspace allocation and an output allocation.
+    void add_quant_convolution_op()
+    {
+        apply_map.emplace("quant_convolution", [=](instruction_ref ins) {
+            auto&& quant_op = any_cast<op::quant_convolution>(ins->get_operator());
+            auto gpu_conv   = miopen_quant_convolution{quant_op, make_conv(quant_op)};
+            // compile() returns the shape used for the workspace buffer below
+            auto ws_shape = gpu_conv.compile(ctx, ins->get_shape(), to_shapes(ins->inputs()));
+
+            auto inputs     = ins->inputs();
+            auto ws_buffer  = insert_allocation(ins, ws_shape, "workspace");
+            auto out_buffer = insert_allocation(ins, ins->get_shape());
+
+            return prog->replace_instruction(
+                ins, gpu_conv, inputs[0], inputs[1], ws_buffer, out_buffer);
+        });
+    }
+
    void add_pooling_op()
    {
        apply_map.emplace("pooling", [=](instruction_ref ins) {