CK GEMM Int8 Bug Fixes (#2229)

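Adds workarounds so that capture ops and scalar literals produced by quantization are not passed as arguments to ck_gemm. The scalar-literal workaround is only active when the MIGRAPHX_ENABLE_CK_WORKAROUNDS environment variable is enabled.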

CK GEMM Int8 Bug Fixes (#2229)
Adds workarounds to avoid passing capture ops and scalar literals from quantization as arguments to ck_gemm.
f47e0b5b · turneram · GitHub · b8b4630b · f47e0b5b · f47e0b5b
Unverified commit f47e0b5b: authored Oct 19, 2023 by turneram; committed by GitHub on Oct 20, 2023.
4 changed files
--- a/src/rewrite_quantization.cpp
+++ b/src/rewrite_quantization.cpp
@@ -33,6 +33,8 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK_WORKAROUNDS);
+
 void apply_quantizelinear(module& m, instruction_ref ins)
 {
    assert(ins->name() == "quantizelinear");
@@ -62,9 +64,22 @@ void apply_quantizelinear(module& m, instruction_ref ins)
        max_quant = qt.max();
        min_quant = qt.min();
    });
-    auto s        = add_zero_point->get_shape();
-    auto min_arg  = m.add_literal(literal{shape{s.type()}, {min_quant}});
-    auto max_arg  = m.add_literal(literal{shape{s.type()}, {max_quant}});
+    auto s = add_zero_point->get_shape();
+    instruction_ref min_arg;
+    instruction_ref max_arg;
+
+    if(enabled(MIGRAPHX_ENABLE_CK_WORKAROUNDS{}))
+    {
+        std::vector<int> min_data(s.elements(), min_quant);
+        std::vector<int> max_data(s.elements(), max_quant);
+        min_arg = m.add_literal(literal(s, min_data));
+        max_arg = m.add_literal(literal(s, max_data));
+    }
+    else
+    {
+        min_arg = m.add_literal(literal{shape{s.type()}, {min_quant}});
+        max_arg = m.add_literal(literal{shape{s.type()}, {max_quant}});
+    }
    auto saturate = insert_common_op(m, ins, make_op("clip"), {add_zero_point, min_arg, max_arg});
    m.replace_instruction(
        ins, make_op("convert", {{"target_type", ins->get_shape().type()}}), saturate);

--- a/src/targets/gpu/fuse_ck.cpp
+++ b/src/targets/gpu/fuse_ck.cpp
@@ -26,6 +26,7 @@
 #include <migraphx/matcher.hpp>
 #include <migraphx/pass_manager.hpp>
 #include <migraphx/register_op.hpp>
+#include <migraphx/gpu/device_name.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -92,6 +93,8 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
    auto m = a.lens()[a.lens().size() - 2];
    auto n = b.lens().back();
    auto k = a.lens().back();
+    auto batch_size = std::accumulate(
+        a.lens().rbegin() + 2, a.lens().rend(), std::size_t{1}, std::multiplies<std::size_t>());
    // Integer gemms must be divisible by 4 in ck
    if(contains({shape::int8_type, shape::int32_type}, ins->get_shape().type()))
    {
@@ -102,9 +105,17 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
        if(k % 4 != 0)
            return false;
    }
-    // Skipping GEMMs with a K dimension greater than 2048 is a course-grained strategy
-    // to avoid poor-performing GEMM kernels from CK
-    // To-do: Investigate a more precise strategy
+    auto device_name = trim(split_string(get_device_name(), ':').front());
+    if(device_name == "gfx940")
+    {
+        if(ins->get_shape().type() == shape::half_type)
+        {
+            if(batch_size >= 64)
+                return m < 2048 or k <= 64 or n <= 384 or n >= 2048;
+            return true;
+        }
+        return true;
+    }
    return k <= 2048;
 }

@@ -140,6 +151,10 @@ struct find_ck_gemm_pointwise
               return not input->inputs().empty() and input->inputs().front()->name() == "capture";
           }))
            return;
+        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) {
+               return not input->inputs().empty() and input->inputs().front()->name() == "capture";
+           }))
+            return;
        assert(gemm_it != inputs.end());
        if(gemm_idx != 0)
        {

--- a/test/onnx/onnx_test.cpp
+++ b/test/onnx/onnx_test.cpp
@@ -42,11 +42,14 @@
 #include <migraphx/op/lrn.hpp>
 #include <migraphx/op/reshape.hpp>
 #include <migraphx/op/unknown.hpp>
+#include <migraphx/env.hpp>

 #include <migraphx/serialize.hpp>

 #include "test.hpp"

+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK_WORKAROUNDS);
+
 migraphx::program optimize_onnx(const std::string& name, bool run_passes = false)
 {
    migraphx::onnx_options options;
@@ -5540,6 +5543,31 @@ TEST_CASE(qlinearmatmul_2D_test)
    EXPECT(p.sort() == prog.sort());
 }

+migraphx::instruction_ref insert_quantizelinear_clip(migraphx::module& m, // module that receives the literals and the clip
+                                                     const migraphx::instruction_ref ins, // forwarded to insert_common_op as its instruction argument
+                                                     const migraphx::instruction_ref round, // value to clip (callers also pass an "add" result here)
+                                                     const migraphx::shape s, // supplies the literal type, and the literal shape when workarounds are on
+                                                     const int64_t min_quant, // lower clip bound
+                                                     const int64_t max_quant) // upper clip bound
+{ // Test helper: adds min/max literals to m and returns a "clip" of `round` against them.
+    migraphx::instruction_ref min_arg;
+    migraphx::instruction_ref max_arg;
+    if(migraphx::enabled(MIGRAPHX_ENABLE_CK_WORKAROUNDS{})) // env var set: use full-shape literals instead of scalars
+    {
+        std::vector<int> min_data(s.elements(), min_quant); // one copy of the bound per element of s; NOTE(review): int64_t is converted to int here
+        std::vector<int> max_data(s.elements(), max_quant);
+        min_arg = m.add_literal(migraphx::literal(s, min_data)); // literal carries shape s itself
+        max_arg = m.add_literal(migraphx::literal(s, max_data));
+    }
+    else
+    {
+        min_arg = m.add_literal(migraphx::literal{migraphx::shape{s.type()}, {min_quant}}); // default: single-value literal of s's type
+        max_arg = m.add_literal(migraphx::literal{migraphx::shape{s.type()}, {max_quant}});
+    }
+
+    return migraphx::insert_common_op(m, ins, migraphx::make_op("clip"), {round, min_arg, max_arg}); // presumably insert_common_op broadcasts the bounds to a common shape - TODO confirm
+}
+
 TEST_CASE(quantizelinear_test)
 {
    migraphx::program p;
@@ -5548,16 +5576,10 @@ TEST_CASE(quantizelinear_test)
    auto l1  = mm->add_parameter("1", {migraphx::shape::float_type, {1}});
    auto l1_mbcast =
        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {5}}}), l1);
-    auto div     = mm->add_instruction(migraphx::make_op("div"), l0, l1_mbcast);
-    auto round   = mm->add_instruction(migraphx::make_op("round"), div);
-    auto s       = round->get_shape();
-    auto min_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {0}});
-    auto max_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {255}});
-    auto min_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), min_arg);
-    auto max_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), max_arg);
-    auto clip = mm->add_instruction(migraphx::make_op("clip"), round, min_mbcast, max_mbcast);
+    auto div   = mm->add_instruction(migraphx::make_op("div"), l0, l1_mbcast);
+    auto round = mm->add_instruction(migraphx::make_op("round"), div);
+    auto s     = round->get_shape();
+    auto clip  = insert_quantizelinear_clip(*mm, div, round, s, 0, 255);
    mm->add_instruction(
        migraphx::make_op("convert",
                          {{"target_type", migraphx::to_value(migraphx::shape::uint8_type)}}),
@@ -5579,16 +5601,10 @@ TEST_CASE(quantizelinear_int32_test)
        migraphx::make_op("convert",
                          {{"target_type", migraphx::to_value(migraphx::shape::float_type)}}),
        l0);
-    auto div     = mm->add_instruction(migraphx::make_op("div"), l0, l1_mbcast);
-    auto round   = mm->add_instruction(migraphx::make_op("round"), div);
-    auto s       = round->get_shape();
-    auto min_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {0}});
-    auto max_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {255}});
-    auto min_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), min_arg);
-    auto max_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), max_arg);
-    auto clip = mm->add_instruction(migraphx::make_op("clip"), round, min_mbcast, max_mbcast);
+    auto div   = mm->add_instruction(migraphx::make_op("div"), l0, l1_mbcast);
+    auto round = mm->add_instruction(migraphx::make_op("round"), div);
+    auto s     = round->get_shape();
+    auto clip  = insert_quantizelinear_clip(*mm, div, round, s, 0, 255);
    mm->add_instruction(
        migraphx::make_op("convert",
                          {{"target_type", migraphx::to_value(migraphx::shape::uint8_type)}}),
@@ -5615,15 +5631,9 @@ TEST_CASE(quantizelinear_zero_point_test)
        migraphx::make_op("convert",
                          {{"target_type", migraphx::to_value(migraphx::shape::float_type)}}),
        l2_mbcast);
-    auto add     = mm->add_instruction(migraphx::make_op("add"), round, l2_mbcast);
-    auto s       = round->get_shape();
-    auto min_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {-128}});
-    auto max_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {127}});
-    auto min_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), min_arg);
-    auto max_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), max_arg);
-    auto clip = mm->add_instruction(migraphx::make_op("clip"), add, min_mbcast, max_mbcast);
+    auto add  = mm->add_instruction(migraphx::make_op("add"), round, l2_mbcast);
+    auto s    = round->get_shape();
+    auto clip = insert_quantizelinear_clip(*mm, div, add, s, -128, 127);
    mm->add_instruction(
        migraphx::make_op("convert",
                          {{"target_type", migraphx::to_value(migraphx::shape::int8_type)}}),
@@ -5654,15 +5664,9 @@ migraphx::program make_quantizelinear_axis_prog()
        migraphx::make_op("convert",
                          {{"target_type", migraphx::to_value(migraphx::shape::float_type)}}),
        l2_bcast);
-    auto add     = mm->add_instruction(migraphx::make_op("add"), round, l2_bcast);
-    auto s       = round->get_shape();
-    auto min_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {-128}});
-    auto max_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {127}});
-    auto min_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), min_arg);
-    auto max_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), max_arg);
-    auto clip = mm->add_instruction(migraphx::make_op("clip"), add, min_mbcast, max_mbcast);
+    auto add  = mm->add_instruction(migraphx::make_op("add"), round, l2_bcast);
+    auto s    = round->get_shape();
+    auto clip = insert_quantizelinear_clip(*mm, div, add, s, -128, 127);
    mm->add_instruction(
        migraphx::make_op("convert",
                          {{"target_type", migraphx::to_value(migraphx::shape::int8_type)}}),

--- a/test/rewrite_quantization_test.cpp
+++ b/test/rewrite_quantization_test.cpp
@@ -31,10 +31,13 @@
 #include <migraphx/ranges.hpp>
 #include <test.hpp>
 #include <migraphx/make_op.hpp>
+#include <migraphx/env.hpp>

 #include <migraphx/serialize.hpp>
 #include <migraphx/pass_manager.hpp>

+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK_WORKAROUNDS);
+
 bool is_quantizelinear(migraphx::instruction& ins) { return ins.name() == "quantizelinear"; }
 bool is_dequantizelinear(migraphx::instruction& ins) { return ins.name() == "dequantizelinear"; }
 bool is_clip_scalar(migraphx::instruction& ins)
@@ -82,7 +85,11 @@ TEST_CASE(quantizelinear)
    EXPECT(any_of(*p1.get_main_module(), &is_quantizelinear));
    EXPECT(none_of(*p2.get_main_module(), &is_quantizelinear));
    // ensure clip literals created in quantized program are scalar
-    EXPECT(any_of(*p2.get_main_module(), &is_clip_scalar));
+    // unless CK workarounds are enabled
+    if(migraphx::enabled(MIGRAPHX_ENABLE_CK_WORKAROUNDS{}))
+        EXPECT(none_of(*p2.get_main_module(), &is_clip_scalar));
+    else
+        EXPECT(any_of(*p2.get_main_module(), &is_clip_scalar));
 }

 TEST_CASE(dequantizelinear)