Commit 4a39a0f7 authored by Shucai Xiao
Browse files

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into add-conv_bn_add-test

parents 5564172e bb827865
#include <migraphx/gpu/driver/parser.hpp>
#include <migraphx/gpu/driver/action.hpp>
#include <iostream>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
// Print a fatal diagnostic and terminate the process.
// Writes to stderr (not stdout) so the message is visible even when stdout is
// redirected, and does not interleave with the tool's measurement output.
[[noreturn]] void error(const std::string& msg)
{
    std::cerr << msg << std::endl;
    std::abort();
}
// Build a shape from a value object: reads "lens", optional "strides", and
// "type" (defaulting to "float"). When no strides are given, a standard
// packed layout is used.
shape parser::parse_shape(const value& v) const
{
    auto dims    = get(v, "lens", std::vector<std::size_t>{});
    auto strides = get(v, "strides", std::vector<std::size_t>{});
    auto t       = shape::parse_type(get<std::string>(v, "type", "float"));
    if(not strides.empty())
        return shape{t, dims, strides};
    return shape{t, dims};
}
std::vector<shape> parser::parse_shapes(const value& v) const
{
std::vector<shape> result;
std::transform(
v.begin(), v.end(), std::back_inserter(result), [&](auto&& x) { return parse_shape(x); });
return result;
}
// Copy the optional "settings" entry of the input into this parser.
void parser::load_settings(const value& v)
{
    if(not v.contains("settings"))
        return;
    settings = v.at("settings");
}
// Dispatch each top-level entry of the input object to its registered action,
// after loading the (reserved) "settings" entry into a fresh parser.
void parser::process(const value& v)
{
    if(not v.is_object())
        error("Input is not an object");
    parser p{};
    p.load_settings(v);
    for(auto&& entry : v)
    {
        // "settings" is consumed by load_settings, not an action name.
        if(entry.get_key() != "settings")
            get_action(entry.get_key())(p, entry.without_key());
    }
}
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/context.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/time.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsigned long seed = 0)
{
std::vector<argument> args;
std::transform(shapes.begin(), shapes.end(), std::back_inserter(args), [&](auto& s) {
return to_gpu(generate_argument(s, seed++));
});
return args;
}
using milliseconds = std::chrono::duration<double, std::milli>;
// Measure the average wall-clock time, in milliseconds, of executing `op`
// on generated inputs over n timed iterations. One untimed warm-up run is
// performed first so one-time costs (finalization, caching) are excluded.
double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n)
{
    // TODO: Use std::ref
    migraphx::context gctx = ctx;
    auto output            = op.compute_shape(inputs);
    op.finalize(gctx, output, inputs);
    auto args = generate_arguments(inputs);
    auto run  = [&] {
        op.compute(gctx, output, args);
        gctx.finish(); // synchronize so the full execution is measured
    };
    run(); // warm-up, not timed
    double total = 0.0;
    for(int i = 0; i < n; i++)
        total += time<milliseconds>(run);
    return total / n;
}
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
// Action that benchmarks a single GPU operator. The input value supplies
// "name" (operator name; "gpu::" is prepended when unqualified) and
// "inputs" (the argument shapes). Prints the average runtime in ms.
struct run_op : action<run_op>
{
    static void apply(const parser& p, const value& v)
    {
        context ctx;
        auto shapes  = p.parse_shapes(v.at("inputs"));
        auto op_name = v.at("name").to<std::string>();
        // Unqualified names default to the gpu namespace.
        if(not contains(op_name, "::"))
            op_name.insert(0, "gpu::");
        auto op          = make_op(op_name);
        const double avg = time_op(ctx, op, shapes);
        std::cout << op << ": " << avg << "ms" << std::endl;
    }
};
} // namespace driver
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -185,7 +185,7 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
if(conv.algo == miopenConvolutionFwdAlgoWinograd and wei.lens()[2] != 3 and
wei.lens()[3] != 3 and contains({{1, 1}}, op.stride))
return false;
return contains({{0, 0}, {1, 1}, {2, 2}}, op.padding) and
return contains({{0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}}, op.padding) and
contains({{0, 0}, {1, 1}}, op.stride) and contains({{1, 1}}, op.dilation);
}
......@@ -568,7 +568,7 @@ struct miopen_conv_bias
{
check_shapes{inputs, *this}.has(5);
// TODO: Check slices
return op.compute_shape({inputs.at(0), inputs.at(1)});
return op.normalize_compute_shape({inputs.at(0), inputs.at(1)});
}
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
......@@ -615,7 +615,7 @@ struct miopen_conv_bias_relu
{
check_shapes{inputs, *this}.has(5);
// TODO: Check slices
return op.compute_shape({inputs.at(0), inputs.at(1)});
return op.normalize_compute_shape({inputs.at(0), inputs.at(1)});
}
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
......@@ -717,7 +717,7 @@ struct find_gemm_add
auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
// Already fused gemm
if(not float_equal(gemm.op.beta, 0))
if(not float_equal(gemm.beta, 0))
return;
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto i) {
......@@ -738,7 +738,7 @@ struct find_gemm_add
inputs.push_back(copy_ins);
inputs.push_back(copy_ins);
gemm.op.beta = 1;
gemm.beta = 1;
p.replace_instruction(ins, gemm, inputs);
}
};
......
......@@ -37,8 +37,12 @@ R rocblas_invoke(R (*f)(Ts...), Us... xs)
}
template <class T>
void gemm_impl(
context& ctx, const shape& output_shape, const std::vector<argument>& args, T alpha, T beta)
void gemm_impl(context& ctx,
const shape& output_shape,
const std::vector<argument>& args,
T alpha,
T beta,
bool int8_x4_format)
{
bool transa = args[0].get_shape().transposed();
bool transb = args[1].get_shape().transposed();
......@@ -62,6 +66,14 @@ void gemm_impl(
}
auto compute_type = output_type;
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
rocblas_gemm_flags flag =
int8_x4_format ? rocblas_gemm_flags_pack_int8x4 : rocblas_gemm_flags_none;
#else
(void)int8_x4_format;
int flag = 0;
#endif
auto a_lens = args[0].get_shape().lens();
auto b_lens = args[1].get_shape().lens();
output_shape.visit_type([&](auto as) {
......@@ -72,7 +84,7 @@ void gemm_impl(
rocblas_int n = out_lens[dim_1];
rocblas_int k = args[0].get_shape().lens()[dim_1];
auto to_pointer = [&](auto&& arg) { return as.from(arg.data()); };
if(args[0].get_shape().type() == shape::int8_type and (k % 4) != 0)
if(args[0].get_shape().type() == shape::int8_type and (k % 4) != 0 and int8_x4_format)
{
MIGRAPHX_THROW("ROCBLAS_GEMM: k size of int8 type input must be mutlple of 4!");
}
......@@ -109,11 +121,7 @@ void gemm_impl(
compute_type,
rocblas_gemm_algo_standard,
0,
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
rocblas_gemm_flags_pack_int8x4);
#else
0);
#endif
flag);
}
else
{
......@@ -146,11 +154,7 @@ void gemm_impl(
compute_type,
rocblas_gemm_algo_standard,
0,
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
rocblas_gemm_flags_pack_int8x4);
#else
0);
#endif
flag);
}
});
}
......@@ -159,18 +163,20 @@ void gemm(context& ctx,
const shape& output_shape,
const std::vector<argument>& args,
float alpha,
float beta)
float beta,
bool int8_x4_format)
{
gemm_impl(ctx, output_shape, args, alpha, beta);
gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format);
}
void gemm(context& ctx,
const shape& output_shape,
const std::vector<argument>& args,
int32_t alpha,
int32_t beta)
int32_t beta,
bool int8_x4_format)
{
gemm_impl(ctx, output_shape, args, alpha, beta);
gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format);
}
} // namespace gpu
......
......@@ -169,12 +169,26 @@ void gpu_copy(context& ctx, const argument& src, const argument& dst)
void copy_to_gpu(context& ctx, const argument& src, const argument& dst)
{
gpu_copy(ctx, register_on_gpu(src), dst);
if(src.get_shape() == dst.get_shape() and dst.get_shape().packed())
{
hip_async_copy(ctx, src, dst, hipMemcpyHostToDevice);
}
else
{
gpu_copy(ctx, register_on_gpu(src), dst);
}
}
void copy_from_gpu(context& ctx, const argument& src, const argument& dst)
{
gpu_copy(ctx, src, register_on_gpu(dst));
if(src.get_shape() == dst.get_shape() and dst.get_shape().packed())
{
hip_async_copy(ctx, src, dst, hipMemcpyDeviceToHost);
}
else
{
gpu_copy(ctx, src, register_on_gpu(dst));
}
}
argument get_preallocation(context& ctx, const std::string& id)
......
......@@ -14,6 +14,7 @@ struct gpu_allocation_model
std::string name() const;
std::string copy() const;
operation allocate(const shape& s) const;
operation preallocate(const shape& s, const std::string& id) const;
};
} // namespace gpu
......
......@@ -51,6 +51,7 @@ struct code_object_op
os << "symbol_name=" << op.symbol_name << ",";
os << "global=" << op.global << ",";
os << "local=" << op.local << ",";
os << "]";
return os;
}
};
......
......@@ -14,8 +14,9 @@ struct hip_compile_options
std::size_t local;
std::vector<shape> inputs;
shape output;
std::string kernel_name = "kernel";
std::string params = "";
std::string kernel_name = "kernel";
std::string params = "";
std::vector<shape> reduced_inputs = {};
};
operation compile_hip_code_object(const std::string& content, hip_compile_options options);
......
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
operation
compile_pointwise(context& ctx, const std::vector<shape>& inputs, const std::string& lambda);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_FILL_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_FILL_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void fill(hipStream_t stream, const argument& result, unsigned long val);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_MULTINOMIAL_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void multinomial(hipStream_t stream,
const argument& result,
const argument& arg0,
const argument& arg1);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_DEVICE_PREFIX_SCAN_SUM_HPP
#define MIGRAPHX_GUARD_DEVICE_PREFIX_SCAN_SUM_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void prefix_scan_sum(hipStream_t stream, const argument& result, const argument& arg, int32_t axis);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_DEVICE_PREFIX_SCAN_SUM_HPP
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REVERSE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REVERSE_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument
reverse(hipStream_t stream, argument result, argument arg1, const std::vector<int64_t>& axes);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SCATTER_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SCATTER_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument scatter(
hipStream_t stream, argument result, argument arg0, argument arg1, argument arg2, int64_t axis);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TOPK_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TOPK_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument topk_smallest(hipStream_t stream,
const argument& val_res,
const argument& ind_res,
const argument& arg,
int64_t k,
int64_t axis);
argument topk_largest(hipStream_t stream,
const argument& val_res,
const argument& ind_res,
const argument& arg,
int64_t k,
int64_t axis);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_WHERE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_WHERE_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void where(hipStream_t stream,
const argument& result,
const argument& arg0,
const argument& arg1,
const argument& arg2);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_GPU_DEVICE_NAME_HPP
#define MIGRAPHX_GUARD_GPU_DEVICE_NAME_HPP
#include <migraphx/config.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
std::string get_device_name();
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_DEVICE_NAME_HPP
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP
#include <migraphx/errors.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/value.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/gpu/context.hpp>
......@@ -19,11 +22,17 @@ template <class Op>
struct rocblas_gemm
{
Op op;
float alpha = 1;
float beta = 0;
bool int8_x4_format = true;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
return pack_join(migraphx::reflect(self.op, f),
pack(f(self.alpha, "alpha"),
f(self.beta, "beta"),
f(self.int8_x4_format, "int8_x4_format")));
}
std::string name() const
......@@ -42,14 +51,25 @@ struct rocblas_gemm
check_shapes{in_shapes, *this}.not_broadcasted();
batch_not_transposed(inputs[0].strides());
batch_not_transposed(inputs[1].strides());
std::size_t kdim = inputs[0].lens().size() - 1;
// k be multiple of 4
if(op.name() == "quant_dot" && (inputs[0].lens()[kdim] % 4) != 0)
// if gemm and add are fused
if(not float_equal(beta, 0))
{
MIGRAPHX_THROW("GPU_GEMM: size of A {" + to_string_range(inputs[0].lens()) +
"} and B {" + to_string_range(inputs[1].lens()) +
"} must be multiple of 4 for int8 type");
auto cmat_shape = in_shapes.back();
in_shapes.pop_back();
auto op_out_shape = op.compute_shape(in_shapes);
if(cmat_shape.lens() != op_out_shape.lens())
{
MIGRAPHX_THROW(this->name() + " : dimension mismatch, operand C: {" +
to_string_range(cmat_shape.lens()) +
"}, cannot add to operand A * B: {" +
to_string_range(op_out_shape.lens()) + "}");
}
if(cmat_shape.type() != op_out_shape.type())
{
MIGRAPHX_THROW(this->name() + " : operand C type mismatch, operand C is of type: " +
to_string(cmat_shape.type()) +
", it must be: " + to_string(op_out_shape.type()));
}
}
return op.compute_shape(in_shapes);
......@@ -58,7 +78,14 @@ struct rocblas_gemm
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
gemm(ctx, output_shape, args, op.alpha, op.beta);
if(this->name() == "gpu::gemm")
{
gemm(ctx, output_shape, args, alpha, beta, int8_x4_format);
}
else
{
gemm(ctx, output_shape, args, int32_t(alpha), int32_t(beta), int8_x4_format);
}
return args.back();
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment