Commit cc30b7c1 authored by Alan Turner

Add all layouts and make qdq use fp16 instead of float

parent 734c2e74
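Note: the qdq change below swaps the intermediate arithmetic of apply_quantizelinear and apply_dequantizelinear from float to fp16 (shape::half_type). As a point of reference, here is a minimal standalone sketch of the quantize/dequantize math those rewrites implement. It uses plain float only because standard C++ lacks a portable half type; quantize_linear and dequantize_linear are illustrative names, not MIGraphX APIs.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// quantizelinear: q = saturate(round(x / scale) + zero_point)
std::int8_t quantize_linear(float x, float scale, std::int8_t zero_point)
{
    float q = std::round(x / scale) + static_cast<float>(zero_point);
    return static_cast<std::int8_t>(std::clamp(q, -128.0f, 127.0f));
}

// dequantizelinear: x = (q - zero_point) * scale
float dequantize_linear(std::int8_t q, float scale, std::int8_t zero_point)
{
    return (static_cast<float>(q) - static_cast<float>(zero_point)) * scale;
}

int main()
{
    auto q = quantize_linear(0.42f, 0.01f, 3); // round(42) + 3 = 45
    std::cout << static_cast<int>(q) << " -> " << dequantize_linear(q, 0.01f, 3) << '\n';
}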
@@ -117,23 +117,23 @@ void quantize_int8(program& prog,
     // use all calibration data to run the program to calculate the
     // quantization scale and shift
-    for(auto&& arg : calibration)
-    {
-        parameter_map m;
-        for(auto&& x : capture_prog.get_parameter_shapes())
-        {
-            if(arg.count(x.first) > 0)
-            {
-                assert(x.second == arg.at(x.first).get_shape());
-                m[x.first] = t.copy_to(arg.at(x.first));
-            }
-            else
-            {
-                m[x.first] = t.allocate(x.second);
-            }
-        }
-        capture_prog.eval(m);
-    }
+    // for(auto&& arg : calibration)
+    // {
+    //     parameter_map m;
+    //     for(auto&& x : capture_prog.get_parameter_shapes())
+    //     {
+    //         if(arg.count(x.first) > 0)
+    //         {
+    //             assert(x.second == arg.at(x.first).get_shape());
+    //             m[x.first] = t.copy_to(arg.at(x.first));
+    //         }
+    //         else
+    //         {
+    //             m[x.first] = t.allocate(x.second);
+    //         }
+    //     }
+    //     capture_prog.eval(m);
+    // }
     // print the quantization parameters in only the main module
     if(enabled(MIGRAPHX_INT8_QUANTIZATION_PARAMS{}))
...
@@ -40,7 +40,7 @@ void apply_quantizelinear(module& m, instruction_ref ins)
     if(x->get_shape().type() != y_scale->get_shape().type())
     {
-        x = m.insert_instruction(ins, make_op("convert", {{"target_type", shape::float_type}}), x);
+        x = m.insert_instruction(ins, make_op("convert", {{"target_type", shape::half_type}}), x);
     }
     auto div = m.insert_instruction(ins, make_op("div"), x, y_scale);
     auto add_zero_point = m.insert_instruction(ins, make_op("round"), div);
@@ -48,7 +48,7 @@ void apply_quantizelinear(module& m, instruction_ref ins)
     if(ins->inputs().size() == 3)
     {
         auto zero_point = m.insert_instruction(
-            ins, make_op("convert", {{"target_type", shape::float_type}}), ins->inputs()[2]);
+            ins, make_op("convert", {{"target_type", shape::half_type}}), ins->inputs()[2]);
         add_zero_point = m.insert_instruction(ins, make_op("add"), add_zero_point, zero_point);
     }
@@ -73,13 +73,13 @@ void apply_dequantizelinear(module& m, instruction_ref ins)
 {
     assert(ins->name() == "dequantizelinear");
     auto x = m.insert_instruction(
-        ins, make_op("convert", {{"target_type", shape::float_type}}), ins->inputs()[0]);
+        ins, make_op("convert", {{"target_type", shape::half_type}}), ins->inputs()[0]);
     auto x_scale = ins->inputs()[1];
     if(ins->inputs().size() == 3)
     {
         auto x_zero_point = m.insert_instruction(
-            ins, make_op("convert", {{"target_type", shape::float_type}}), ins->inputs()[2]);
+            ins, make_op("convert", {{"target_type", shape::half_type}}), ins->inputs()[2]);
         x = m.insert_instruction(ins, make_op("sub"), x, x_zero_point);
     }
...
@@ -60,8 +60,8 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
         return false;
     auto a = ins->inputs().front()->get_shape();
     auto b = ins->inputs().back()->get_shape();
-    if(a.lens().back() > 2048)
-        return false;
+    // if(a.lens().back() > 2048)
+    //     return false;
     return true;
 }
@@ -87,7 +87,7 @@ struct find_ck_gemm_pointwise
         auto gemm_it = std::find(inputs.begin(), inputs.end(), x_ins);
         auto gemm_idx = gemm_it - inputs.begin();
         assert(gemm_it != inputs.end());
-        if(ins->get_shape().type() != shape::int8_type and ins->get_shape().type())
+        if(ins->get_shape().type() != shape::int8_type)
            return;
        if(gemm_idx != 0)
        {
...
@@ -71,47 +71,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using Empty_Tuple = ck::Tuple<>;
-using GEMM = ck::tensor_operation::device::DeviceGemmMultipleD_Dl<
-    Row,
-    Row,
-    Empty_Tuple,
-    Row,
-    int8_t,
-    int8_t,
-    int32_t,
-    Empty_Tuple,
-    int8_t, // EDataType
-    PassThrough,
-    PassThrough,
-    PassThrough,
-    ck::tensor_operation::device::GemmSpecialization::MNKPadding,
-    256,
-    128,
-    128,
-    16,
-    4,
-    4,
-    4,
-    1,
-    S<8,2>,
-    S<8,2>,
-    S<8,1,1,4>,
-    S<2,1,128,1>,
-    S<1,2,0,3>,
-    S<1,2,0,3>,
-    S<4,1,1,4>,
-    S<1,2,0,3>,
-    S<1,1,1,4>,
-    S<2,1,4,4>,
-    S<8,1,32,1>,
-    S<0,3,1,2>,
-    S<0,3,1,2>,
-    S<1,1,4,1>,
-    S<0,3,1,2>,
-    S<1,1,4,4>,
-    S<0,1,2,3,4,5>,
-    5,
-    4>;
+using GEMM = ck::tensor_operation::device::${instance1}${padding}${instance2};
 namespace migraphx {
@@ -335,6 +295,34 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>
         auto a_shape = inputs[0];
         auto b_shape = inputs[1];
         auto c_shape = inputs.back();
+        auto transa = transposed_matrix(a_shape);
+        auto transb = transposed_matrix(b_shape);
+        std::string instance_str1;
+        std::string instance_str2;
+        if (transa and not transb)
+        {
+            instance_str1 = "DeviceGemmMultipleD_Dl< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ";
+            instance_str2 = ", 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>";
+        }
+        else if (transa and transb)
+        {
+            instance_str1 = "DeviceGemmMultipleD_Dl< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ";
+            instance_str2 = ", 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>";
+        }
+        else if (not transa and not transb)
+        {
+            instance_str1 = "DeviceGemmMultipleD_Dl< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ";
+            instance_str2 = ", 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>";
+        }
+        else
+        {
+            instance_str1 = "DeviceGemmMultipleD_Dl< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ";
+            instance_str2 = ", 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>";
+        }
         auto rank = a_shape.lens().size();
         auto b_strides = b_shape.strides();
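The if/else chain above amounts to mapping the two transpose flags onto CK's Row/Col layout tags, which become the first two template arguments of DeviceGemmMultipleD_Dl. A hedged sketch of that mapping, assuming transposed_matrix() reports a column-major operand; ck_layouts is an illustrative helper, not part of the codebase:

#include <string>

// A transposed operand is column-major ("Col"), otherwise row-major ("Row").
std::string ck_layouts(bool transa, bool transb)
{
    std::string a_layout = transa ? "Col" : "Row";
    std::string b_layout = transb ? "Col" : "Row";
    return a_layout + ", " + b_layout; // first two template arguments of the instance
}
// ck_layouts(true, false)  -> "Col, Row"
// ck_layouts(false, false) -> "Row, Row"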
@@ -361,19 +349,26 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>
             ip.set_ds_op(v.at("post").to<std::string>());
         }
+        auto m_per_block = 128;
+        auto n_per_block = 128;
+        auto k_per_block = 16;
-        auto padding = ip.get_pad(config);
         std::string gemm_type;
-        for(auto i : range(padding.size()))
-        {
-            if(padding[i] != 0)
-                gemm_type += keys[i];
-        }
-        if(gemm_type.empty())
-            gemm_type = "Default";
+        // if (int_div_ceil(m, m_per_block) * m_per_block - m != 0)
+        //     gemm_type += "M";
+        // if (int_div_ceil(n, n_per_block) * n_per_block - n != 0)
+        //     gemm_type += "N";
+        // if (int_div_ceil(k, k_per_block) * k_per_block - k != 0)
+        //     gemm_type += "K";
+        if ((int_div_ceil(m, m_per_block) * m_per_block - m != 0) or (int_div_ceil(n, n_per_block) * n_per_block - n != 0))
+            gemm_type = "MNPadding";
+        else
+            gemm_type += "Padding";
+        gemm_type = "Default";
-        ip.set_gemm("ck::tensor_operation::device::GemmSpecialization::" + gemm_type);
+        std::string padding_str = "ck::tensor_operation::device::GemmSpecialization::" + gemm_type;
+        std::cout << padding_str << std::endl;
+        // std::exit(0);
         auto blocks_per_batch = int_div_ceil(m, 128) * int_div_ceil(n, 128);
         ; // ip.get_grid_size(config);
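The check above tests whether m or n is a multiple of its per-block tile size; the subtraction form is equivalent to a plain remainder test. A minimal sketch of that equivalence, assuming int_div_ceil rounds up as its name suggests (needs_padding is an illustrative helper):

#include <iostream>

// Rounds x / y up to the nearest integer, matching the assumed
// behavior of MIGraphX's int_div_ceil.
constexpr int int_div_ceil(int x, int y) { return (x + y - 1) / y; }

// A dimension needs a padded CK GemmSpecialization exactly when it is
// not a multiple of its per-block tile size:
//   ceil(dim / per_block) * per_block - dim != 0  <=>  dim % per_block != 0
constexpr bool needs_padding(int dim, int per_block)
{
    return int_div_ceil(dim, per_block) * per_block - dim != 0;
}

int main()
{
    std::cout << needs_padding(130, 128) << '\n'; // 1: 130 is not a tile multiple
    std::cout << needs_padding(256, 128) << '\n'; // 0: 256 divides evenly
}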
@@ -402,7 +397,9 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>
         options.params += " -DMIGRAPHX_CK_CHECK=1";
         auto src = interpolate_string(ck_gemm_kernel,
-                                      {{"instance", ip.str()},
+                                      {{"instance1", instance_str1},
+                                       {"instance2", instance_str2},
+                                       {"padding", padding_str},
                                        {"params", enum_params(inputs.size(), "void * private_p")},
                                        {"args", enum_params(inputs.size(), "private_p")},
                                        {"blocks_per_batch", to_string(blocks_per_batch)},
...