Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f945f40a
Commit
f945f40a
authored
Apr 20, 2023
by
Astha Rai
Browse files
gemm example: compiles into normal executable and .so
parent
3af15212
Changes
19
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
1553 additions
and
1774 deletions
+1553
-1774
python/ait_impl/generation/Makefile
python/ait_impl/generation/Makefile
+2
-2
python/ait_impl/generation/demo.py
python/ait_impl/generation/demo.py
+0
-19
python/ait_impl/generation/ex/Makefile
python/ait_impl/generation/ex/Makefile
+16
-0
python/ait_impl/generation/ex/ex.cpp
python/ait_impl/generation/ex/ex.cpp
+183
-0
python/ait_impl/generation/ex/gemm_ex.py
python/ait_impl/generation/ex/gemm_ex.py
+293
-0
python/ait_impl/generation/ex/normal/Makefile
python/ait_impl/generation/ex/normal/Makefile
+14
-0
python/ait_impl/generation/ex/normal/gemm_ex.py
python/ait_impl/generation/ex/normal/gemm_ex.py
+293
-0
python/ait_impl/generation/ex/normal/gemm_ex_code.py
python/ait_impl/generation/ex/normal/gemm_ex_code.py
+215
-0
python/ait_impl/generation/ex/normal/input.py
python/ait_impl/generation/ex/normal/input.py
+1
-0
python/ait_impl/generation/ex/potential/Makefile
python/ait_impl/generation/ex/potential/Makefile
+13
-0
python/ait_impl/generation/ex/shared/Makefile
python/ait_impl/generation/ex/shared/Makefile
+14
-0
python/ait_impl/generation/ex/shared/gemm_ex.py
python/ait_impl/generation/ex/shared/gemm_ex.py
+293
-0
python/ait_impl/generation/ex/shared/gemm_ex_code.py
python/ait_impl/generation/ex/shared/gemm_ex_code.py
+215
-0
python/ait_impl/generation/ex/shared/input.py
python/ait_impl/generation/ex/shared/input.py
+1
-0
python/ait_impl/generation/gemm_dev_op.py
python/ait_impl/generation/gemm_dev_op.py
+0
-665
python/ait_impl/generation/gemm_kernel.py
python/ait_impl/generation/gemm_kernel.py
+0
-175
python/ait_impl/generation/norm_ex.py
python/ait_impl/generation/norm_ex.py
+0
-282
python/ait_impl/generation/permute_ex.py
python/ait_impl/generation/permute_ex.py
+0
-43
python/ait_impl/generation/xx.cpp
python/ait_impl/generation/xx.cpp
+0
-588
No files found.
python/ait_impl/generation/Makefile
View file @
f945f40a
gemm
:
xx.o
CFLAGS
=
-I
~/
rocm
/composable_kernel/include
-I
/opt/rocm-5.1.1/hip/include
-I
~/
rocm
/composable_kernel/include/
-I
~/
rocm
/composable_kernel/include/ck/
-I
~/
rocm
/composable_kernel/include/ck/problem_transform/
-I
~/
rocm
/composable_kernel/include/ck/tensor/
-I
~/
rocm
/composable_kernel/include/ck/tensor_description/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/block/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/device/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/device/impl/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/element/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/grid/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/thread/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/warp/
-I
~/
rocm
/composable_kernel/include/ck/host_utility
-I
/external/include/half/
-I
~/
rocm
/composable_kernel/library/include/ck/library/host/
-I
~/
rocm
/composable_kernel/library/include/ck/library/host_tensor/
-I
~/
rocm
/composable_kernel/library/include/ck/library/obselete_driver_offline/
-I
~/
rocm
/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/
-I
~/
rocm
/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/
-I
~/
rocm
/composable_kernel/library/include/ck/library/tensor_operation_instance/
-I
~/
rocm
/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/
" + "
reduce/
-I
~/
rocm
/composable_kernel/library/include/ck/library/tensor_op/
-I
~/
rocm
/composable_kernel/library/include/ck/library/utility/
-I
~/
rocm
/composable_kernel/profiler/include/
CFLAGS
=
-I
~/
workspace
/composable_kernel/include
-I
/opt/
workspace/
rocm-5.1.1/hip/include
-I
~/
workspace
/composable_kernel/include/
-I
~/
workspace
/composable_kernel/include/ck/
-I
~/
workspace
/composable_kernel/include/ck/problem_transform/
-I
~/
workspace
/composable_kernel/include/ck/tensor/
-I
~/
workspace
/composable_kernel/include/ck/tensor_description/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/block/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/device/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/device/impl/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/element/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/grid/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/thread/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/warp/
-I
~/
workspace
/composable_kernel/include/ck/host_utility
-I
/external/include/half/
-I
~/
workspace
/composable_kernel/library/include/ck/library/host/
-I
~/
workspace
/composable_kernel/library/include/ck/library/host_tensor/
-I
~/
workspace
/composable_kernel/library/include/ck/library/obselete_driver_offline/
-I
~/
workspace
/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/
-I
~/
workspace
/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/
-I
~/
workspace
/composable_kernel/library/include/ck/library/tensor_operation_instance/
-I
~/
workspace
/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/
" + "
reduce/
-I
~/
workspace
/composable_kernel/library/include/ck/library/tensor_op/
-I
~/
workspace
/composable_kernel/library/include/ck/library/utility/
-I
~/
workspace
/composable_kernel/profiler/include/
CXXFLAGS
=
-std
=
c++17
xx.o
:
hipcc
-fPIC
-fvisibility
=
hidden
$(CXXFLAGS)
-w
/opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc
$(CFLAGS)
-L
/opt/rocm/rocrand
-lrocrand
-x
hip
-c
-o
xx.cpp
hipcc
-fPIC
-fvisibility
=
hidden
$(CXXFLAGS)
-w
/opt/rocm
-5.3.0
/amdgcn/bitcode/oclc_abi_version_400.bc
$(CFLAGS)
-L
/opt/rocm
-5.3.0
/rocrand
-lrocrand
-x
hip
-c
xx.cpp
python/ait_impl/generation/demo.py
deleted
100644 → 0
View file @
3af15212
import
jinja2
SHAPE_EVAL_TEMPLATE
=
jinja2
.
Template
(
"""
int M = *in_{{ range(rank - 1)|join(' * *in_') }};
int N = *in_{{rank - 1}};
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto K1Number = Number<K1>{};
"""
)
output
=
SHAPE_EVAL_TEMPLATE
.
render
(
rank
=
2
);
print
(
output
)
\ No newline at end of file
python/ait_impl/generation/ex/Makefile
0 → 100644
View file @
f945f40a
# Build the standalone CK GEMM example.
# FIX(review): removed a literal `" + "` artifact (leftover Python string
# concatenation pasted into the Makefile) that broke the
# .../tensor_operation_instance/gpu/reduce/ include path.
# NOTE(review): recipe lines must begin with a hard TAB.
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17

# Link the example executable from the compiled objects.
gemm: ex.o host_tensor.o device_memory.o
	hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm

device_memory.o: ../../../../library/src/utility/device_memory.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp

host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp

# FIX(review): added ex.cpp as a prerequisite so edits to the source trigger
# a rebuild (the rule previously had no dependencies).
ex.o: ex.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
python/ait_impl/generation/ex/ex.cpp
0 → 100644
View file @
f945f40a
#pragma once // NOTE(review): no-op in a .cpp compiled directly; normally header-only.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"

// Problem element types: fp16 operands and output, fp32 accumulation.
using ADataType   = ck::half_t;
using BDataType   = ck::half_t;
using CDataType   = ck::half_t;
using AccDataType = float;

// A is column-major; B and C are row-major (Col/Row come from common.hpp).
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;

// No elementwise transform on any operand.
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;

static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;

// DeviceGemmDl tuned instance. Tuning parameters (BlockSize 256, 128x128 tile,
// K0PerBlock 16, K1 2, per-thread/cluster shapes) are the generator defaults.
// FIX(review): the data-type and GemmSpecialization arguments now reference
// the aliases declared above instead of repeating the raw types, so the
// aliases and the instance cannot silently drift apart. The layout and
// elementwise-op arguments are left spelled out because Col/Row/PassThrough
// are declared in common.hpp and not visible here — presumably they equal
// these types; confirm before substituting those aliases too.
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
    ADataType,
    BDataType,
    CDataType,
    AccDataType,
    ck::tensor_layout::gemm::ColumnMajor,
    ck::tensor_layout::gemm::RowMajor,
    ck::tensor_layout::gemm::RowMajor,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    GemmDefault,
    256, 128, 128, 16, 2, // BlockSize, MPerBlock, NPerBlock, K0PerBlock, K1
    4, 4, 1,              // M1PerThread, N1PerThread, KPerThread
    S<8, 2>, S<8, 2>,     // M1N1 thread cluster M1Xs / N1Xs
    // A blockwise-transfer: slice, cluster, arrange order, src access order,
    // src vector lengths, src vector contiguous dim order, dst vector lengths
    S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>,
    S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>,
    // B blockwise-transfer: same parameter roles as above, over K0/N0/N1/K1
    S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>,
    S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>,
    // C threadwise-transfer: src/dst access order, vector dim, scalars/vector
    S<0, 1, 2, 3, 4, 5>, 5, 4>;

// Host-side reference GEMM used for verification.
using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool
run_gemm
(
const
ProblemSize
&
problem_size
,
const
ExecutionConfig
&
config
)
{
using
namespace
ck
::
literals
;
auto
&
[
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
]
=
problem_size
;
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
constexpr
(
std
::
is_same_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
}
};
Tensor
<
ck
::
half_t
>
a_m_k
(
f_host_tensor_descriptor
(
M
,
K
,
StrideA
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
{}));
Tensor
<
ck
::
half_t
>
b_k_n
(
f_host_tensor_descriptor
(
K
,
N
,
StrideB
,
ck
::
tensor_layout
::
gemm
::
RowMajor
{}));
switch
(
config
.
init_method
)
{
case
0
:
break
;
case
1
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ck
::
half_t
>
{
-
5.
f
,
5.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ck
::
half_t
>
{
-
5.
f
,
5.
f
}(
b_k_n
);
break
;
default:
ck
::
utils
::
FillUniformDistribution
<
ck
::
half_t
>
{
-
1.
f
,
1.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistribution
<
ck
::
half_t
>
{
-
1.
f
,
1.
f
}(
b_k_n
);
}
Tensor
<
ck
::
half_t
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
ck
::
half_t
>
c_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
DeviceMem
a_m_k_device_buf
(
sizeof
(
ck
::
half_t
)
*
a_m_k
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
ck
::
half_t
)
*
b_k_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
ck
::
half_t
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpaceSize
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
auto
a_element_op
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
{};
auto
b_element_op
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
{};
auto
c_element_op
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
{};
// do GEMM
auto
gemm
=
DeviceGemmInstance
{};
auto
invoker
=
gemm
.
MakeInvoker
();
auto
argument
=
gemm
.
MakeArgument
(
static_cast
<
ck
::
half_t
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ck
::
half_t
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ck
::
half_t
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()),
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
a_element_op
,
b_element_op
,
c_element_op
);
if
(
!
gemm
.
IsSupportedArgument
(
argument
))
{
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
return
true
;
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
std
::
size_t
flop
=
2
_uz
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ck
::
half_t
)
*
M
*
K
+
sizeof
(
ck
::
half_t
)
*
K
*
N
+
sizeof
(
ck
::
half_t
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
if
(
config
.
do_verification
)
{
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_m_k
,
b_k_n
,
c_m_n_host_result
,
a_element_op
,
b_element_op
,
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
}
return
true
;
}
// Parses the command line into a ProblemSize/ExecutionConfig pair and runs the
// GEMM. Returns true when parsing fails (the example still exits "cleanly")
// and otherwise forwards run_gemm's result.
bool run_gemm_example(int argc, char* argv[])
{
    ProblemSize problem_size;
    ExecutionConfig config;

    // Guard clause instead of the short-circuit `!parse || run` expression:
    // a parse failure skips the GEMM entirely, exactly as before.
    if(!parse_cmd_args(argc, argv, problem_size, config))
    {
        return true;
    }
    return run_gemm(problem_size, config);
}
// Process entry point: 0 on success, 1 on failure (inverts the bool result).
int main(int argc, char* argv[])
{
    const bool ok = run_gemm_example(argc, argv);
    return ok ? 0 : 1;
}
python/ait_impl/generation/ex/gemm_ex.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
subprocess
import
re
def SubstituteTemplate(template, values):
    """Expand every ``${key}`` placeholder in *template* using *values*.

    Substitution passes repeat until a full pass changes nothing, so a value
    may itself contain further ``${...}`` placeholders that get expanded on a
    later pass. (Beware: a value containing its own key would never settle.)

    :param template: template string containing ``${key}`` placeholders
    :param values: dict mapping placeholder names to replacement strings
    :return: the fully substituted text
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            # FIX: use a callable replacement so backslashes in `value` are
            # inserted literally. Passing `value` as a plain repl string makes
            # re.sub parse it for group references/escapes and raise
            # re.error("bad escape") on values such as "a\\b".
            newtext = re.sub(regex, lambda _match: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class EmitGemmInstance:
    """Renders a standalone composable_kernel DeviceGemmDl example (ex.cpp plus
    a Makefile) from ``${...}`` templates and kicks off a ``make`` build.

    FIXES(review):
    - removed the literal ``" + "`` artifact (leftover Python string
      concatenation) inside the Makefile CFLAGS template, which previously
      emitted a broken ``.../tensor_operation_instance/gpu/" + "reduce/`` path;
    - files are written via context managers so they are always closed/flushed
      before ``make`` runs;
    - removed unused PIPE/STDOUT locals and the duplicate template rendering.
    """

    def __init__(self):
        # Makefile template. NOTE(review): make requires recipe lines to start
        # with a hard TAB; the scrape this was reviewed from lost leading
        # whitespace, so confirm the tabs below survive into the written file.
        self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
	hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""

        # C++ source template: the ex.cpp example with every DeviceGemmDl
        # template argument exposed as a ${placeholder}.
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
    ${type_a},
    ${type_b},
    ${type_c},
    ${type_acc},
    ${layout_a},
    ${layout_b},
    ${layout_c},
    ${elementwise_op_a},
    ${elementwise_op_b},
    ${elementwise_op_c},
    ${Gemm_spec},
    ${block_size},
    ${mperblock},
    ${nperblock},
    ${k0perblock},
    ${k1},
    ${m1perthread},
    ${n1perthread},
    ${kperthread},
    ${m1n1_thcluster_m1xs},
    ${m1n1_thcluster_n1xs},
    ${ABT_thread_slice_lengths_K0_M0_M1_K1},
    ${ABT_thread_cluster_lengths_K0_M0_M1_K1},
    ${ABT_thread_cluster_arrange_order},
    ${ABT_src_access_order},
    ${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
    ${ABT_src_vec_tensor_cont_dim_order},
    ${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
    ${BBT_thread_slice_lengths_K0_N0_N1_K1},
    ${BBT_thread_cluster_lengths_K0_N0_N1_K1},
    ${BBT_thread_cluster_arrange_order},
    ${BBT_src_access_order},
    ${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
    ${BBT_src_vec_tensor_cont_dim_order},
    ${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
    ${CTT_src_dst_access_order},
    ${CTT_src_dst_vec_dim},
    ${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
    using namespace ck::literals;
    auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
            {
                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };
    Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
    Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
    switch(config.init_method)
    {
    case 0: break;
    case 1:
        ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
        ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
        break;
    default:
        ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
        ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
    }
    Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
    DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
    auto a_element_op = ${elementwise_op_a}{};
    auto b_element_op = ${elementwise_op_b}{};
    auto c_element_op = ${elementwise_op_c}{};
    // do GEMM
    auto gemm = DeviceGemmInstance{};
    auto invoker = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(
        static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
        static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
        static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
        M,
        N,
        K,
        StrideA,
        StrideB,
        StrideC,
        a_element_op,
        b_element_op,
        c_element_op);
    if(!gemm.IsSupportedArgument(argument))
    {
        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
        return true;
    }
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
    std::size_t flop = 2_uz * M * N * K;
    std::size_t num_btype =
        sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;
    if(config.do_verification)
    {
        auto ref_gemm = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
        auto ref_argument = ref_gemm.MakeArgument(
            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
        ref_invoker.Run(ref_argument);
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
        return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
    }
    return true;
}
bool run_gemm_example(int argc, char* argv[])
{
    ProblemSize problem_size;
    ExecutionConfig config;
    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""

    def emit(self):
        """Render both templates with the default fp16 Col x Row -> Row
        DeviceGemmDl configuration, write ``ex.cpp`` and ``Makefile`` into the
        working directory, and run ``make``."""
        # Default tuning values for the DeviceGemmDl instance.
        values = {
            'type_a': 'ck::half_t',
            'type_b': 'ck::half_t',
            'type_c': 'ck::half_t',
            'type_acc': 'float',
            'layout_a': 'ck::tensor_layout::gemm::ColumnMajor',
            'layout_b': 'ck::tensor_layout::gemm::RowMajor',
            'layout_c': 'ck::tensor_layout::gemm::RowMajor',
            'elementwise_op_a': 'ck::tensor_operation::element_wise::PassThrough',
            'elementwise_op_b': 'ck::tensor_operation::element_wise::PassThrough',
            'elementwise_op_c': 'ck::tensor_operation::element_wise::PassThrough',
            'Gemm_spec': 'ck::tensor_operation::device::GemmSpecialization::Default',
            'block_size': '256',
            'mperblock': '128',
            'nperblock': '128',
            'k0perblock': '16',
            'k1': '2',
            'm1perthread': '4',
            'n1perthread': '4',
            'kperthread': '1',
            'm1n1_thcluster_m1xs': 'S<8, 2>',
            'm1n1_thcluster_n1xs': 'S<8, 2>',
            'ABT_thread_slice_lengths_K0_M0_M1_K1': 'S<2, 1, 4, 2>',
            'ABT_thread_cluster_lengths_K0_M0_M1_K1': 'S<8, 1, 32, 1>',
            'ABT_thread_cluster_arrange_order': 'S<0, 3, 1, 2>',
            'ABT_src_access_order': 'S<0, 3, 1, 2>',
            'ABT_src_vec_tensor_lengths_K0_M0_M1_K1': 'S<1, 1, 4, 1>',
            'ABT_src_vec_tensor_cont_dim_order': 'S<0, 3, 1, 2>',
            'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1': 'S<1, 1, 4, 2>',
            'BBT_thread_slice_lengths_K0_N0_N1_K1': 'S<2, 1, 4, 2>',
            'BBT_thread_cluster_lengths_K0_N0_N1_K1': 'S<8, 1, 32, 1>',
            'BBT_thread_cluster_arrange_order': 'S<0, 3, 1, 2>',
            'BBT_src_access_order': 'S<0, 3, 1, 2>',
            'BBT_src_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 1>',
            'BBT_src_vec_tensor_cont_dim_order': 'S<0, 3, 1, 2>',
            'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 2>',
            'CTT_src_dst_access_order': 'S<0, 1, 2, 3, 4, 5>',
            'CTT_src_dst_vec_dim': '5',
            'CTT_dst_scalar_per_vector': '4',
        }

        # Render each template once, echo it (original behavior), and write it.
        source = SubstituteTemplate(self.gemm_devop_template, values)
        print(source)
        with open("ex.cpp", 'w') as cf:
            cf.write(source)

        makefile = SubstituteTemplate(self.make_template, values)
        print(makefile)
        with open("Makefile", 'w') as cf:
            cf.write(makefile)

        # Build the rendered example. NOTE(review): shell=True with a list
        # argument is fragile but preserved from the original; on POSIX the
        # first element ("make") becomes the shell command.
        proc = subprocess.Popen(
            ["make"],
            shell=True,
            env=os.environ.copy(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = proc.communicate()
# Script entry point: instantiate the generator, then render and build the
# GEMM example in the current working directory.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/ex/normal/Makefile
0 → 100644
View file @
f945f40a
# Build the "normal" (plain executable) CK GEMM example.
# FIX(review): removed a literal `" + "` artifact (leftover Python string
# concatenation) that previously broke the
# .../tensor_operation_instance/gpu/reduce/ include path.
# NOTE(review): recipe lines must begin with a hard TAB.
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17

# Link the example executable from the compiled objects.
gemm: ex.o host_tensor.o device_memory.o
	hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm

device_memory.o: ../../../../../library/src/utility/device_memory.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp

host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp

# FIX(review): added ex.cpp as a prerequisite so source edits trigger rebuilds.
ex.o: ex.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
python/ait_impl/generation/ex/normal/gemm_ex.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
subprocess
import
re
def SubstituteTemplate(template, values):
    """Expand every ``${key}`` placeholder in *template* using *values*.

    Substitution passes repeat until a full pass changes nothing, so a value
    may itself contain further ``${...}`` placeholders that get expanded on a
    later pass. (Beware: a value containing its own key would never settle.)

    :param template: template string containing ``${key}`` placeholders
    :param values: dict mapping placeholder names to replacement strings
    :return: the fully substituted text
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            # FIX: use a callable replacement so backslashes in `value` are
            # inserted literally. Passing `value` as a plain repl string makes
            # re.sub parse it for group references/escapes and raise
            # re.error("bad escape") on values such as "a\\b".
            newtext = re.sub(regex, lambda _match: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class
EmitGemmInstance
:
def
__init__
(
self
):
self
.
make_template
=
"""
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
self
.
gemm_devop_template
=
"""
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""
def
emit
(
self
):
values
=
{
'type_a'
:
'ck::half_t'
,
'type_b'
:
'ck::half_t'
,
'type_c'
:
'ck::half_t'
,
'type_acc'
:
'float'
,
'layout_a'
:
'ck::tensor_layout::gemm::ColumnMajor'
,
'layout_b'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_c'
:
'ck::tensor_layout::gemm::RowMajor'
,
'elementwise_op_a'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_b'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_c'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'Gemm_spec'
:
'ck::tensor_operation::device::GemmSpecialization::Default'
,
'block_size'
:
'256'
,
'mperblock'
:
'128'
,
'nperblock'
:
'128'
,
'k0perblock'
:
'16'
,
'k1'
:
'2'
,
'm1perthread'
:
'4'
,
'n1perthread'
:
'4'
,
'kperthread'
:
'1'
,
'm1n1_thcluster_m1xs'
:
'S<8, 2>'
,
'm1n1_thcluster_n1xs'
:
'S<8, 2>'
,
'ABT_thread_slice_lengths_K0_M0_M1_K1'
:
'S<2, 1, 4, 2>'
,
'ABT_thread_cluster_lengths_K0_M0_M1_K1'
:
'S<8, 1, 32, 1>'
,
'ABT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 1>'
,
'ABT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 2>'
,
'BBT_thread_slice_lengths_K0_N0_N1_K1'
:
'S<2, 1, 4, 2>'
,
'BBT_thread_cluster_lengths_K0_N0_N1_K1'
:
'S<8, 1, 32, 1>'
,
'BBT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 1>'
,
'BBT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 2>'
,
'CTT_src_dst_access_order'
:
'S<0, 1, 2, 3, 4, 5>'
,
'CTT_src_dst_vec_dim'
:
'5'
,
'CTT_dst_scalar_per_vector'
:
'4'
}
template
=
self
.
gemm_devop_template
cf
=
open
(
"ex.cpp"
,
'w'
)
print
(
SubstituteTemplate
(
template
,
values
))
cf
.
write
(
SubstituteTemplate
(
template
,
values
))
cf
.
close
()
m_template
=
self
.
make_template
cf
=
open
(
"Makefile"
,
'w'
)
print
(
SubstituteTemplate
(
m_template
,
values
))
cf
.
write
(
SubstituteTemplate
(
m_template
,
values
))
cf
.
close
()
PIPE
=
-
1
STDOUT
=
-
2
proc
=
subprocess
.
Popen
(
[
"make"
],
shell
=
True
,
env
=
os
.
environ
.
copy
(),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
)
out
,
err
=
proc
.
communicate
()
# This module acts as a script: instantiating the generator and calling
# emit() renders ex.cpp + Makefile and runs `make` as an import-time side
# effect.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/ex/normal/gemm_ex_code.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def SubstituteTemplate(template, values):
    """Expand ``${key}`` placeholders in *template* until a fixed point.

    Substitution is re-applied while any replacement changes the text, so a
    value may itself contain further ``${...}`` placeholders that refer to
    other keys in *values*.

    Args:
        template: Text containing ``${key}`` placeholders.
        values:   Mapping of placeholder name -> replacement text.

    Returns:
        The template with every known placeholder expanded.
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            # re.escape() protects keys containing regex metacharacters,
            # and the lambda replacement inserts the value verbatim —
            # previously a value with backslashes (e.g. a Windows path or
            # "\g") was interpreted as a re.sub replacement template and
            # raised "bad escape" or corrupted the output.
            regex = "\\$\\{%s\\}" % re.escape(key)
            newtext = re.sub(regex, lambda _m: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class EmitGemmInstance:
    """Emits a fully-specialised DeviceGemmDl example translation unit.

    Unlike the parameterised generator, ``gemm_devop_template`` here is
    already concrete C++ (no ``${...}`` placeholders remain), so
    :meth:`emit` effectively copies the template to ``xx.cpp``.
    """

    def __init__(self):
        # NOTE(review): the template's verification path instantiated
        # ReferenceGemmInstance without ever defining it, so the generated
        # C++ could not compile when do_verification was set.  The alias is
        # now defined right after the device instance, mirroring the
        # parameterised gemm_ex template.
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""

    def emit(self):
        """Render the example source to ``xx.cpp`` (and echo it to stdout)."""
        values = {
            # Kept for interface parity with the parameterised generator;
            # this template has no ${...} placeholders left to substitute.
            'type_a': 'ck::half_t',
        }
        # Render once and reuse; `with` closes the file even on error.
        source = SubstituteTemplate(self.gemm_devop_template, values)
        print(source)
        with open("xx.cpp", 'w') as cf:
            cf.write(source)
# Script entry: render the example C++ to xx.cpp as an import-time side
# effect.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/ex/normal/input.py
0 → 100644
View file @
f945f40a
# Take GEMM input from the user and send it to the example template.
\ No newline at end of file
python/ait_impl/generation/ex/potential/Makefile
0 → 100644
View file @
f945f40a
# Build the standalone GEMM example with hipcc against Composable Kernel.
CC = /opt/rocm/bin/hipcc
CK_PATH = /dockerx/composable_kernel/
CFLAGS = -O3 -std=c++17 -DCK_AMD_GPU_GFX90A --offload-arch=gfx90a -I "${CK_PATH}/include" -I "${CK_PATH}/library/include" -I "${CK_PATH}/profiler/include"
OBJS = ex.o host_tensor.o device_memory.o

all: $(OBJS)
	$(CC) $(CFLAGS) $(OBJS) -o ex

# ex.o is in OBJS but previously had no rule: make's built-in %.o rule
# would have compiled it with $(CXX), not hipcc, and without $(CFLAGS).
ex.o: ex.cpp
	$(CC) $(CFLAGS) -c ex.cpp

device_memory.o: ../../library/src/utility/device_memory.cpp
	$(CC) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp

host_tensor.o: ../../library/src/utility/host_tensor.cpp
	$(CC) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
\ No newline at end of file
python/ait_impl/generation/ex/shared/Makefile
0 → 100644
View file @
f945f40a
# Build the GEMM example as a shared library (test.so) with hipcc.
# NOTE(review): the -I path for .../tensor_operation_instance/gpu/reduce/
# previously contained a literal '" + "' (a leftover Python string
# concatenation); it is folded into a single valid path below.
CFLAGS = -I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17

test.so: ex.o host_tensor.o device_memory.o
	hipcc -shared $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o test.so

device_memory.o: ../../../../../library/src/utility/device_memory.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp

host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp

# ex.o previously had no prerequisite, so it was never rebuilt once it
# existed, even when ex.cpp changed.
ex.o: ex.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
python/ait_impl/generation/ex/shared/gemm_ex.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
subprocess
import
re
def SubstituteTemplate(template, values):
    """Expand ``${key}`` placeholders in *template* until a fixed point.

    Substitution is re-applied while any replacement changes the text, so a
    value may itself contain further ``${...}`` placeholders that refer to
    other keys in *values*.

    Args:
        template: Text containing ``${key}`` placeholders.
        values:   Mapping of placeholder name -> replacement text.

    Returns:
        The template with every known placeholder expanded.
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            # re.escape() protects keys containing regex metacharacters,
            # and the lambda replacement inserts the value verbatim —
            # previously a value with backslashes was interpreted as a
            # re.sub replacement template and raised "bad escape" or
            # corrupted the output.
            regex = "\\$\\{%s\\}" % re.escape(key)
            newtext = re.sub(regex, lambda _m: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class EmitGemmInstance:
    """Generates, writes and builds a parameterised DeviceGemmDl example.

    ``gemm_devop_template`` exposes every DeviceGemmDl template argument as
    a ``${...}`` placeholder; :meth:`emit` fills them in, writes ``ex.cpp``
    and a ``Makefile``, and invokes ``make``.
    """

    def __init__(self):
        # Makefile template.  NOTE(review): the include path for
        # .../tensor_operation_instance/gpu/reduce/ previously contained a
        # literal '" + "' (leftover Python string concatenation) which ended
        # up verbatim in the generated Makefile; it is fixed here.  The ex.o
        # rule also gains its ex.cpp prerequisite so the object is rebuilt
        # when the generated source changes.
        self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
	hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o: ex.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
        # C++ example template; every ${...} placeholder is filled from the
        # values dict in emit().
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""

    def emit(self):
        """Render ex.cpp and Makefile from the templates and run ``make``.

        Returns:
            (stdout, stderr) bytes captured from the build so callers can
            inspect failures (previously they were silently discarded).
        """
        values = {
            'type_a': 'ck::half_t',
            'type_b': 'ck::half_t',
            'type_c': 'ck::half_t',
            'type_acc': 'float',
            'layout_a': 'ck::tensor_layout::gemm::ColumnMajor',
            'layout_b': 'ck::tensor_layout::gemm::RowMajor',
            'layout_c': 'ck::tensor_layout::gemm::RowMajor',
            'elementwise_op_a': 'ck::tensor_operation::element_wise::PassThrough',
            'elementwise_op_b': 'ck::tensor_operation::element_wise::PassThrough',
            'elementwise_op_c': 'ck::tensor_operation::element_wise::PassThrough',
            'Gemm_spec': 'ck::tensor_operation::device::GemmSpecialization::Default',
            'block_size': '256',
            'mperblock': '128',
            'nperblock': '128',
            'k0perblock': '16',
            'k1': '2',
            'm1perthread': '4',
            'n1perthread': '4',
            'kperthread': '1',
            'm1n1_thcluster_m1xs': 'S<8, 2>',
            'm1n1_thcluster_n1xs': 'S<8, 2>',
            'ABT_thread_slice_lengths_K0_M0_M1_K1': 'S<2, 1, 4, 2>',
            'ABT_thread_cluster_lengths_K0_M0_M1_K1': 'S<8, 1, 32, 1>',
            'ABT_thread_cluster_arrange_order': 'S<0, 3, 1, 2>',
            'ABT_src_access_order': 'S<0, 3, 1, 2>',
            'ABT_src_vec_tensor_lengths_K0_M0_M1_K1': 'S<1, 1, 4, 1>',
            'ABT_src_vec_tensor_cont_dim_order': 'S<0, 3, 1, 2>',
            'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1': 'S<1, 1, 4, 2>',
            'BBT_thread_slice_lengths_K0_N0_N1_K1': 'S<2, 1, 4, 2>',
            'BBT_thread_cluster_lengths_K0_N0_N1_K1': 'S<8, 1, 32, 1>',
            'BBT_thread_cluster_arrange_order': 'S<0, 3, 1, 2>',
            'BBT_src_access_order': 'S<0, 3, 1, 2>',
            'BBT_src_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 1>',
            'BBT_src_vec_tensor_cont_dim_order': 'S<0, 3, 1, 2>',
            'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 2>',
            'CTT_src_dst_access_order': 'S<0, 1, 2, 3, 4, 5>',
            'CTT_src_dst_vec_dim': '5',
            'CTT_dst_scalar_per_vector': '4',
        }
        # Render each template exactly once and reuse the result (the
        # original rendered twice: once to print, once to write).
        source = SubstituteTemplate(self.gemm_devop_template, values)
        print(source)
        with open("ex.cpp", 'w') as cf:
            cf.write(source)
        makefile = SubstituteTemplate(self.make_template, values)
        print(makefile)
        with open("Makefile", 'w') as cf:
            cf.write(makefile)
        # Build in the current directory with the caller's environment.
        proc = subprocess.Popen(
            ["make"],
            shell=True,
            env=os.environ.copy(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = proc.communicate()
        return out, err
# This module acts as a script: instantiating the generator and calling
# emit() renders ex.cpp + Makefile and runs `make` as an import-time side
# effect.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/ex/shared/gemm_ex_code.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def SubstituteTemplate(template, values):
    """Expand ``${key}`` placeholders in *template* until a fixed point.

    Substitution is re-applied while any replacement changes the text, so a
    value may itself contain further ``${...}`` placeholders that refer to
    other keys in *values*.

    Args:
        template: Text containing ``${key}`` placeholders.
        values:   Mapping of placeholder name -> replacement text.

    Returns:
        The template with every known placeholder expanded.
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            # re.escape() protects keys containing regex metacharacters,
            # and the lambda replacement inserts the value verbatim —
            # previously a value with backslashes was interpreted as a
            # re.sub replacement template and raised "bad escape" or
            # corrupted the output.
            regex = "\\$\\{%s\\}" % re.escape(key)
            newtext = re.sub(regex, lambda _m: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class EmitGemmInstance:
    """Emits a fully-specialised DeviceGemmDl example translation unit.

    Unlike the parameterised generator, ``gemm_devop_template`` here is
    already concrete C++ (no ``${...}`` placeholders remain), so
    :meth:`emit` effectively copies the template to ``xx.cpp``.
    """

    def __init__(self):
        # NOTE(review): the template's verification path instantiated
        # ReferenceGemmInstance without ever defining it, so the generated
        # C++ could not compile when do_verification was set.  The alias is
        # now defined right after the device instance, mirroring the
        # parameterised gemm_ex template.
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""

    def emit(self):
        """Render the example source to ``xx.cpp`` (and echo it to stdout)."""
        values = {
            # Kept for interface parity with the parameterised generator;
            # this template has no ${...} placeholders left to substitute.
            'type_a': 'ck::half_t',
        }
        # Render once and reuse; `with` closes the file even on error.
        source = SubstituteTemplate(self.gemm_devop_template, values)
        print(source)
        with open("xx.cpp", 'w') as cf:
            cf.write(source)
# Script entry: render the example C++ to xx.cpp as an import-time side
# effect.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/ex/shared/input.py
0 → 100644
View file @
f945f40a
# Take the GEMM problem description as user input and forward it to the example template.
\ No newline at end of file
python/ait_impl/generation/gemm_dev_op.py
deleted
100644 → 0
View file @
3af15212
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def SubstituteTemplate(template, values):
    """Expand every ``${key}`` placeholder in ``template`` using ``values``.

    Substitution passes repeat until a fixed point is reached, so a
    replacement value may itself contain placeholders for other keys.
    """
    result = template
    while True:
        previous = result
        for name, replacement in values.items():
            result = re.sub("\\$\\{%s\\}" % name, replacement, result)
        if result == previous:
            return result
class
EmitGemmInstance
:
def
__init__
(
self
):
self
.
gemm_devop_template
=
"""
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <
typename ${type_a},
typename ${type_b},
typename ${type_c},
typename ${type_acc},
typename ${layout_a},
typename ${layout_b},
typename ${layout_c},
typename ${elementwise_op_a},
typename ${elementwise_op_b},
typename ${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
typename ${m1n1_thcluster_m1xs},
typename ${m1n1_thcluster_n1xs},
typename ${ABT_thread_slice_lengths_K0_M0_M1_K1},
typename ${ABT_thread_cluster_lengths_K0_M0_M1_K1},
typename ${ABT_thread_cluster_arrange_order},
typename ${ABT_src_access_order},
typename ${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
typename ${ABT_src_vec_tensor_cont_dim_order},
typename ${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
typename ${BBT_thread_slice_lengths_K0_N0_N1_K1},
typename ${BBT_thread_cluster_lengths_K0_N0_N1_K1},
typename ${BBT_thread_cluster_arrange_order},
typename ${BBT_src_access_order},
typename ${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
typename ${BBT_src_vec_tensor_cont_dim_order},
typename ${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
typename ${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>
struct DeviceGemmDl : public DeviceGemm<${layout_a},
${layout_b},
${layout_c},
${type_a},
${type_b},
${type_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c}>
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto K1Number = Number<${k1}>{};
static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
{
assert(K % ${k1} == 0);
const index_t K0 = K / ${k1};
const auto a_grid_desc_m_k = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ${layout_a}>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ${layout_a}>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
}
}();
if constexpr(${Gemm_spec} == GemmSpecialization::MNPadding)
{
const auto PadM = (${mperblock} - M % ${mperblock}) % ${mperblock};
return transform_tensor_descriptor(
a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_right_pad_transform(M, PadM)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
else
{
return transform_tensor_descriptor(
a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(M)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
}
static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
{
assert(K % ${k1} == 0);
const index_t K0 = K / ${k1};
const auto b_grid_desc_k_n = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ${layout_b}>::value)
{
return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ${layout_B}>::value)
{
return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
}
}();
if constexpr(${Gemm_spec} == GemmSpecialization::MNPadding)
{
const auto PadN = (${nperblock} - N % ${nperblock}) % ${nperblock};
return transform_tensor_descriptor(
b_grid_desc_k_n,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
else
{
return transform_tensor_descriptor(
b_grid_desc_k_n,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
}
static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
{
const auto c_grid_desc_m_n = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ${layout_c}>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ${layout_c}>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
}
}();
if constexpr(${Gemm_spec} == GemmSpecialization::MNPadding)
{
const auto PadM = (${mperblock} - M % ${mperblock}) % ${mperblock};
const auto PadN = (${nperblock} - N % ${nperblock}) % ${nperblock};
return transform_tensor_descriptor(
c_grid_desc_m_n,
make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
return transform_tensor_descriptor(
c_grid_desc_m_n,
make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
// GridwiseGemm
using GridwiseGemm =
GridwiseGemmDl_km_kn_mn_v1r3<BlockSize,
${type_a},
${type_acc},
${type_c},
InMemoryDataOperationEnum::Set,
AGridDesc_K0_M_K1,
BGridDesc_K0_N_K1,
CGridDesc_M_N,
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using AGridDesc_K0_M0_M1_K1 =
decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{}));
using BGridDesc_K0_N0_N1_K1 =
decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{}));
using CGridDesc_M0_M10_M11_N0_N10_N11 =
decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{}));
using DefaultBlock2CTileMap =
decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(CGridDesc_M_N{}));
// Argument
struct Argument : public BaseArgument
{
Argument(const ${type_a}* p_a_grid,
const ${type_b}* p_b_grid,
${type_c}* p_c_grid,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
index_t M01,
index_t N01,
${elementwise_op_a} a_element_op,
${elementwise_op_b} b_element_op,
${elementwise_op_c} c_element_op)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_c_grid_{p_c_grid},
a_grid_desc_k0_m0_m1_k1_{},
b_grid_desc_k0_n0_n1_k1_{},
c_grid_desc_m0_m10_m11_n0_n10_n11_{},
block_2_ctile_map_{},
M01_{M01},
N01_{N01},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
a_grid_desc_k0_m_k1_ = DeviceGemmDl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
b_grid_desc_k0_n_k1_ = DeviceGemmDl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
c_grid_desc_m_n_ = DeviceGemmDl::MakeCGridDescriptor_M_N(M, N, StrideC);
if(GridwiseGemm::CheckValidity(
a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, c_grid_desc_m_n_))
{
a_grid_desc_k0_m0_m1_k1_ =
GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(a_grid_desc_k0_m_k1_);
b_grid_desc_k0_n0_n1_k1_ =
GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(b_grid_desc_k0_n_k1_);
c_grid_desc_m0_m10_m11_n0_n10_n11_ =
GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(c_grid_desc_m_n_);
block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_);
}
}
// private:
const ${type_a}* p_a_grid_;
const ${type_b}* p_b_grid_;
${type_c}* p_c_grid_;
AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
CGridDesc_M_N c_grid_desc_m_n_;
AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1_;
BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1_;
CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11_;
DefaultBlock2CTileMap block_2_ctile_map_;
// TODO: unused, but may be useful in future.
index_t M01_;
index_t N01_;
// TODO: unused since gridwise_gemm_dl_v1r3 does NOT support prologue for the time being.
${elementwise_op_a} a_element_op_;
${elementwise_op_b} b_element_op_;
${elementwise_op_c} c_element_op_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceGemmDl::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
{
std::cout << "arg.a_grid_desc_k0_m0_m1_k1_{"
<< arg.a_grid_desc_k0_m_k1_.GetLength(I0) << ", "
<< arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
<< arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.b_grid_desc_k0_n0_n1_k1_{"
<< arg.b_grid_desc_k0_n_k1_.GetLength(I0) << ", "
<< arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
<< arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
if(!GridwiseGemm::CheckValidity(
arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_))
{
throw std::runtime_error(
"wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdl_v2r3 has invalid setting");
}
const index_t grid_size = GridwiseGemm::CalculateGridSize(
arg.c_grid_desc_m_n_.GetLength(I0), arg.c_grid_desc_m_n_.GetLength(I1));
const auto K0 = arg.a_grid_desc_k0_m0_m1_k1_.GetLength(I0);
const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0);
const bool has_double_tail_k_block_loop =
GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0);
float ave_time = 0;
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dl_v1r3<GridwiseGemm,
${type_a},
${type_c},
remove_reference_t<AGridDesc_K0_M0_M1_K1>,
remove_reference_t<BGridDesc_K0_N0_N1_K1>,
remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
remove_reference_t<DefaultBlock2CTileMap>,
true,
true>;
ave_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(${block_size}),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_k0_m0_m1_k1_,
arg.b_grid_desc_k0_n0_n1_k1_,
arg.c_grid_desc_m0_m10_m11_n0_n10_n11_,
arg.block_2_ctile_map_);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dl_v1r3<GridwiseGemm,
${type_a},
${type_c},
remove_reference_t<AGridDesc_K0_M0_M1_K1>,
remove_reference_t<BGridDesc_K0_N0_N1_K1>,
remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
remove_reference_t<DefaultBlock2CTileMap>,
true,
false>;
ave_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(${block_size}),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_k0_m0_m1_k1_,
arg.b_grid_desc_k0_n0_n1_k1_,
arg.c_grid_desc_m0_m10_m11_n0_n10_n11_,
arg.block_2_ctile_map_);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dl_v1r3<GridwiseGemm,
${type_a},
${type_c},
remove_reference_t<AGridDesc_K0_M0_M1_K1>,
remove_reference_t<BGridDesc_K0_N0_N1_K1>,
remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
remove_reference_t<DefaultBlock2CTileMap>,
false,
true>;
ave_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(${block_size}),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_k0_m0_m1_k1_,
arg.b_grid_desc_k0_n0_n1_k1_,
arg.c_grid_desc_m0_m10_m11_n0_n10_n11_,
arg.block_2_ctile_map_);
}
else
{
const auto kernel =
kernel_gemm_dl_v1r3<GridwiseGemm,
${type_a},
${type_c},
remove_reference_t<AGridDesc_K0_M0_M1_K1>,
remove_reference_t<BGridDesc_K0_N0_N1_K1>,
remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
remove_reference_t<DefaultBlock2CTileMap>,
false,
false>;
ave_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(${block_size}),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_k0_m0_m1_k1_,
arg.b_grid_desc_k0_n0_n1_k1_,
arg.c_grid_desc_m0_m10_m11_n0_n10_n11_,
arg.block_2_ctile_map_);
}
return ave_time;
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030")
{
return GridwiseGemm::CheckValidity(
arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
}
else
{
return false;
}
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const ${type_a}* p_a,
const ${type_b}* p_b,
${type_c}* p_c,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
${elementwise_op_a} a_element_op,
${elementwise_op_b} b_element_op,
${elementwise_op_c} c_element_op)
{
return Argument{p_a,
p_b,
p_c,
M,
N,
K,
StrideA,
StrideB,
StrideC,
1,
1,
a_element_op,
b_element_op,
c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
${elementwise_op_a} a_element_op,
${elementwise_op_b} b_element_op,
${elementwise_op_c} c_element_op) override
{
return std::make_unique<Argument>(static_cast<const ${type_a}*>(p_a),
static_cast<const ${type_b}*>(p_b),
static_cast<${type_c}*>(p_c),
M,
N,
K,
StrideA,
StrideB,
StrideC,
1,
1,
a_element_op,
b_element_op,
c_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceGemmDl"
<< "<"
<< ${block_size} << ", "
<< ${mperblock} << ", "
<< ${nperblock} << ", "
<< ${k0perblock} << ", "
<< ${k1} << ", "
<< ${m1perthread} << ", "
<< ${n1perthread} << ", "
<< ${kperthread}
<< ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
"""
def
emit
(
self
):
values
=
{
'type_a'
:
'ck::half_t'
,
'type_b'
:
'ck::half_t'
,
'type_c'
:
'ck::half_t'
,
'type_acc'
:
'float'
,
'layout_a'
:
'ck::tensor_layout::gemm::ColMajor'
,
'layout_b'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_c'
:
'ck::tensor_layout::gemm::RowMajor'
,
'elementwise_op_a'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_b'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_c'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'Gemm_spec'
:
'ck::tensor_operation::device::GemmSpecialization::Default'
,
'block_size'
:
'256'
,
'mperblock'
:
'128'
,
'nperblock'
:
'128'
,
'k0perblock'
:
'16'
,
'k1'
:
'2'
,
'm1perthread'
:
'4'
,
'n1perthread'
:
'4'
,
'kperthread'
:
'1'
,
'm1n1_thcluster_m1xs'
:
'S<8, 2>'
,
'm1n1_thcluster_n1xs'
:
'S<8, 2>'
,
'ABT_thread_slice_lengths_K0_M0_M1_K1'
:
'S<2, 1, 4, 2>'
,
'ABT_thread_cluster_lengths_K0_M0_M1_K1'
:
'S<8, 1, 32, 1>'
,
'ABT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 1>'
,
'ABT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 2>'
,
'BBT_thread_slice_lengths_K0_N0_N1_K1'
:
'S<2, 1, 4, 2>'
,
'BBT_thread_cluster_lengths_K0_N0_N1_K1'
:
'S<8, 1, 32, 1>'
,
'BBT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 1>'
,
'BBT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 2>'
,
'CTT_src_dst_access_order'
:
'S<0, 1, 2, 3, 4, 5>'
,
'CTT_src_dst_vec_dim'
:
'5'
,
'CTT_dst_scalar_per_vector'
:
'4'
}
template
=
self
.
gemm_devop_template
cf
=
open
(
"xx.cpp"
,
'w'
)
print
(
SubstituteTemplate
(
template
,
values
))
cf
.
write
(
SubstituteTemplate
(
template
,
values
))
cf
.
close
()
# Script entry: build the emitter and generate the GEMM source file.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/gemm_kernel.py
deleted
100644 → 0
View file @
3af15212
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def SubstituteTemplate(template, values):
    """Replace each ``${key}`` placeholder in ``template`` with its value.

    Passes repeat until no substitution changes the text, so replacement
    values may themselves reference other keys.
    """
    text = template
    dirty = True
    while dirty:
        dirty = False
        for key, value in values.items():
            pattern = "\\$\\{%s\\}" % key
            substituted = re.sub(pattern, value, text)
            if substituted != text:
                text = substituted
                dirty = True
    return text
class
EmitGemmInstance
:
def
__init__
(
self
):
self
.
gemm_kernel_template
=
"""
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_K0_M0_M1_K1,
typename BGridDesc_K0_N0_N1_K1,
typename CGridDesc_M0_M10_M11_N0_N10_N11,
typename Block2CTileMap,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dl_v1r3(const ${type_ab}* __restrict__ ${p_a_grid},
const ${type_ab}* __restrict__ ${p_b_grid},
${type_c}* __restrict__ ${p_c_grid},
const ${A_GridDesc_K0_M_K1} ${a_grid_desc_k0_m0_m1_k1},
const ${BGridDesc_K0_N_K1} ${b_grid_desc_k0_n0_n1_k1},
const ${CGridDesc_M0_M10_M11_N0_N10_N11} ${c_grid_desc_m0_m10_m11_n0_n10_n11},
const Block2CTileMap ${block_2_ctile_map})
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(${type_ab});
__shared__ ${type_ab} p_shared_block[shared_block_size];
GridwiseGemm::Run(${p_a_grid},
${p_b_grid},
${p_c_grid},
p_shared_block,
${a_grid_desc_k0_m0_m1_k1},
${b_grid_desc_k0_n0_n1_k1},
${c_grid_desc_m0_m10_m11_n0_n10_n11},
${block_2_ctile_map},
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
template <index_t BlockSize,
${type_ab},
${type_acc},
${type_c},
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
${A_GridDesc_K0_M_K1},
${BGridDesc_K0_N_K1},
${CGridDesc_M_N},
${mperblock},
${nperblock},
${k0perblock},
${k1value},
${M1PerThreadM111},
${N1PerThreadN111},
${KPerThread},
${M11N11ThreadClusterM110Xs},
${M11N11ThreadClusterN110Xs},
${ABlockTransferThreadSliceLengths_K0_M0_M1_K1},
${ABlockTransferThreadClusterLengths_K0_M0_M1_K1},
${ABlockTransferThreadClusterArrangeOrder},
${ABlockTransferSrcAccessOrder},
${ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1},
${ABlockTransferSrcVectorTensorContiguousDimOrder},
${ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1},
${BBlockTransferThreadSliceLengths_K0_N0_N1_K1},
${BBlockTransferThreadClusterLengths_K0_N0_N1_K1},
${BBlockTransferThreadClusterArrangeOrder},
${BBlockTransferSrcAccessOrder},
${BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1},
${BBlockTransferSrcVectorTensorContiguousDimOrder},
${BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1},
${CThreadTransferSrcDstAccessOrder},
${CThreadTransferSrcDstVectorDim},
${CThreadTransferDstScalarPerVector}>
struct GridwiseGemmDl_km_kn_mn_v1r3
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
// K1 should be Number<...>
static constexpr auto K1 = Number<K1Value>{};
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
// TODO: change this. I think it needs multi-dimensional alignment
constexpr auto max_lds_align = K1;
// TODO: check alignment
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_k_m = make_naive_tensor_descriptor_aligned(
make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
// TODO: check alignment
// B matrix in LDS memory, dst of blockwise copy
constexpr auto b_block_desc_k_n = make_naive_tensor_descriptor_aligned(
make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
// TODO: check alignment
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_aligned_space_size =
math::integer_least_multiple(a_block_desc_k_m.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_aligned_space_size =
math::integer_least_multiple(b_block_desc_k_n.GetElementSpaceSize(), max_lds_align);
return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB);
}
"""
def
emit
(
self
):
values
=
{
'function_name'
:
"gemm"
,
'type_a'
:
'ck::half_t'
,
'type_b'
:
'ck::half_t'
,
'type_c'
:
'ck::half_t'
,
'type_acc'
:
'float'
,
'layout_a'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_b'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_c'
:
'ck::tensor_layout::gemm::RowMajor'
,
'elementwise_op_a'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_b'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_c'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'Gemm_spec'
:
'ck::tensor_operation::device::GemmSpecialization::MNKPadding'
,
'block_size'
:
'256'
,
'mperblock'
:
'64'
,
'nperblock'
:
'128'
,
'kperblock'
:
'32'
,
'k1'
:
'8'
,
'mperxdl'
:
'32'
,
'nperxdl'
:
'32'
,
'mxdlperwave'
:
'1'
,
'nxdlperwave'
:
'2'
,
'threadclusterlength_a'
:
'ck::Sequence<4,64,1>'
,
'threadclusterarrange_a'
:
'ck::Sequence<1,0,2>'
,
'srcaccessorder_a'
:
'ck::Sequence<1,0,2>'
,
'srcvectordim_a'
:
'2'
,
'srcscalarpervec_a'
:
'8'
,
'dstscalarpervec_a'
:
'8'
,
'add_extra_dim_a'
:
'1'
,
'threadclusterlength_b'
:
'ck::Sequence<8,32,1>'
,
'threadclusterarrange_b'
:
'ck::Sequence<0,2,1>'
,
'srcaccessorder_b'
:
'ck::Sequence<0,2,1>'
,
'srcvectordim_b'
:
'1'
,
'srcscalarpervec_b'
:
'4'
,
'dstscalarpervec_b'
:
'2'
,
'add_extra_dim_b'
:
'0'
,
'dstscalarpervec_c'
:
'8'
}
template
=
self
.
gemm_template
print
(
SubstituteTemplate
(
template
,
values
))
\ No newline at end of file
python/ait_impl/generation/norm_ex.py
deleted
100644 → 0
View file @
3af15212
import
os
import
re
from
hashlib
import
sha1
from
typing
import
Any
,
Dict
,
OrderedDict
import
jinja2
#from ...target import Target
#templating
FUNC_CALL_PARAM_TEMPLATE
=
jinja2
.
Template
(
"(void *)({{name}})"
)
INSTANCE_TEMPLATE
=
jinja2
.
Template
(
"""
using {{name}} = {{ config_name }};
"""
)
ARGS_PARSE_TEMPLATE
=
jinja2
.
Template
(
"""
{% for idx in range(rank) %}
const int64_t in_{{idx}} = std::stoi(argv[{{ idx + 1 }}]);
{% endfor %}
"""
)
STRUCTS_DEF_TEMPLATE
=
jinja2
.
Template
(
"""
struct ProfilerMemoryPool {
ProfilerMemoryPool() {
std::random_device rd;
gen = std::mt19937(rd());
uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
offsets.reserve(512);
strides.reserve(512);
copies.reserve(512);
ptrs.reserve(512);
}
~ProfilerMemoryPool() {
for(int i = 0; i < ptrs.size(); i++){
hipFree(ptrs[i]);
}
}
template <typename DType>
DType* AllocateGaussianTensor(int64_t size) {
size_t length = size * sizeof(DType);
DType *d_x;
hipMalloc(&d_x, length);
float mean = 0.0f;
float stddev = 1.0f;
uint64_t seed = uniform_dist(gen);
rocrand_set_seed(generator, seed);
rocrand_generate_normal(generator, reinterpret_cast<float*>(d_x), size, mean, stddev);
return d_x;
}
ck::half_t* AllocateHalfGaussianTensor(int64_t size) {
return reinterpret_cast<ck::half_t*>(
AllocateGaussianTensor<ck::half_t>(size));
}
int AllocateHalfTensor(int64_t size, int64_t copy) {
offsets.push_back(0);
strides.push_back(size);
copies.push_back(copy);
auto ptr = AllocateHalfGaussianTensor(size * copy);
ptrs.push_back(reinterpret_cast<void*>(ptr));
return ptrs.size() - 1;
}
ck::half_t* RequestHalfTensorByIdx(int idx) {
auto copy = copies.at(idx);
auto offset = offsets.at(idx);
auto stride = strides.at(idx);
ck::half_t* ptr = reinterpret_cast<ck::half_t*>(ptrs.at(idx));
ptr += offset;
offset += stride;
if (offset == copy * stride) {
offset = 0;
}
offsets[idx] = offset;
return ptr;
}
std::vector<int64_t> offsets;
std::vector<int64_t> strides;
std::vector<int64_t> copies;
std::vector<void*> ptrs;
std::mt19937 gen;
std::uniform_int_distribution<int64_t> uniform_dist;
rocrand_generator generator;
};
// hack for DeviceMem linking error
// TODO fix this by making CK a header-only lib
// <<< hack begin
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p) const
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p) const
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl() {
hipGetErrorString(hipEventCreate(&mStart));
hipGetErrorString(hipEventCreate(&mEnd));
}
~KernelTimerImpl() {
hipGetErrorString(hipEventDestroy(mStart));
hipGetErrorString(hipEventDestroy(mEnd));
}
void Start() {
hipGetErrorString(hipDeviceSynchronize());
hipGetErrorString(hipEventRecord(mStart, nullptr));
}
void End() {
hipGetErrorString(hipEventRecord(mEnd, nullptr));
hipGetErrorString(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const {
float time;
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
hipEvent_t mStart, mEnd;
};
// >>> hack end
"""
)
FUNC_TEMPLATE
=
jinja2
.
Template
(
"""
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <random>
#include <rocrand/rocrand.h>
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/reduction_operator.hpp"
{{extra_headers}}
{{extra_code}}
{{instances_decl}}
{{func_signature}}
{
{{shape_eval}}
{{exec_paths}}
}
"""
)
FUNC_CALL_TEMPLATE
=
jinja2
.
Template
(
"""
{{indent}}{{func_name}}(
{{indent}} {{input}},
{{indent}} {{output}},
{% for name in input_dim_names %}
{{indent}} const_cast<int64_t *>(&{{name}}),
{% endfor %}
{{indent}} stream
{{indent}});
"""
)
PROFILER_TEMPLATE
=
jinja2
.
Template
(
"""
size_t GLOBAL_WORKSPACE_SIZE = 0;
{{op_func}}
{{structs_def}}
int main(int argc, char** argv) {
{{args_parse}}
auto memory_pool = std::make_unique<ProfilerMemoryPool>();
hipStream_t stream = nullptr;
{{tensor_decl}}
// warmup
for(int i = 0; i < 3; ++i) {
{{func_call}}
}
// run
KernelTimerImpl timer;
timer.Start();
for(int i = 0; i < 5; ++i) {
{{func_call}}
}
timer.End();
std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;
}
"""
)
# rendering (messy, need to modularize and organize)
# def gen_profiler(
# shape_eval_template: jinja2.Template,
# exec_template: jinja2.Template,
# tensor_decl_template: jinja2.Template,
# extra_header_template: jinja2.Template,
# get_func_signature: Any,
# extra_code: str = "",
# func_call_template: jinja2.Template = FUNC_CALL_TEMPLATE,
# indent: str = " ",
# ) -> str:
# shape_eval_template: jinja2.Template
# exec_template: jinja2.Template
# tensor_decl_template: jinja2.Template
#extra_header_template: jinja2.Template
get_func_signature
:
Any
extra_code
:
str
=
""
func_call_template
:
jinja2
.
Template
=
FUNC_CALL_TEMPLATE
indent
:
str
=
" "
# shape_eval = shape_eval_template.render(rank=2) #if shape_eval_template else ""
# exe_path = exec_template.render(instance="DeviceInstance",dtype="void",reduce_dims=1,rank=2,eps=eps,)
instances
=
INSTANCE_TEMPLATE
.
render
(
name
=
"DeviceInstance"
,
config_name
=
"ck::tensor_operation::device::DeviceLayernormImpl"
,)
op_func
=
FUNC_TEMPLATE
.
render
(
instances_decl
=
instances
,
#func_signature=get_func_signature(func_attrs),
#shape_eval=shape_eval,
#exec_paths=exe_path,
#extra_headers=extra_header_template.render(),
extra_code
=
extra_code
,)
structs_def
=
STRUCTS_DEF_TEMPLATE
.
render
()
args_parse
=
ARGS_PARSE_TEMPLATE
.
render
(
rank
=
2
)
#tensor_decl = tensor_decl_template.render(rank=2)
input_dim_names
=
[
f
"in_
{
i
}
"
for
i
in
range
(
2
)]
func_call
=
func_call_template
.
render
(
func_name
=
"norm"
,
input
=
"(void *) memory_pool->RequestHalfTensorByIdx(0)"
,
gamma
=
"(void *) memory_pool->RequestHalfTensorByIdx(2)"
,
beta
=
"(void *) memory_pool->RequestHalfTensorByIdx(3)"
,
output
=
"(void *) memory_pool->RequestHalfTensorByIdx(1)"
,
input_dim_names
=
input_dim_names
,
indent
=
indent
,
)
code
=
PROFILER_TEMPLATE
.
render
(
op_func
=
op_func
,
structs_def
=
structs_def
,
args_parse
=
args_parse
,
#tensor_decl=tensor_decl,
func_call
=
func_call
,
)
# print(instances)
# print(args_parse)
# print(structs_def)
#print(func_call)
#print(op_func)
print
(
code
)
python/ait_impl/generation/permute_ex.py
deleted
100644 → 0
View file @
3af15212
import
jinja2
EXTRA_SHAPE_TEMPLATE
=
jinja2
.
Template
(
"""
{{indent}}const int64_t stride_a = *a_dim1;
{{indent}}const int64_t stride_b = *b_dim1;
{{indent}}const int64_t stride_c = *c_dim1;
ck::index_t M0 = M / G1 / G2;
ck::index_t M1 = G1;
ck::index_t M2 = G2;
ck::index_t N0 = G3;
ck::index_t N1 = N / G3;
// GEMM shape
//ck::index_t M = M0 * M1 * M2;
//ck::index_t N = N0 * N1;
//ck::index_t K = 128;
//ck::index_t stride_A = K;
//ck::index_t stride_B = K;
// E = [M0, N0, M1, N1, M2]
/* 0, 3, 1, 4, 2
ck::index_t stride_E_M0 = N0 * M1 * N1 * M2;
ck::index_t stride_E_M1 = N1 * M2;
ck::index_t stride_E_M2 = 1;
ck::index_t stride_E_N0 = M1 * N1 * M2;
ck::index_t stride_E_N1 = M2;
*/
// E = [M2, M0, N0, M1, N1] 2, 0, 3, 1, 4
ck::index_t stride_E_M0 = N0* M1* N1;
ck::index_t stride_E_M1 = N1;
ck::index_t stride_E_M2 = M0* N0* M1* N1;
ck::index_t stride_E_N0 = M1 * N1;
ck::index_t stride_E_N1 = 1;
// D = [0, N0, 0, N1, 0]
ck::index_t stride_D_M0 = 0;
ck::index_t stride_D_M1 = 0;
ck::index_t stride_D_M2 = 0;
ck::index_t stride_D_N0 = N1;
ck::index_t stride_D_N1 = 1;
"""
)
output
=
EXTRA_SHAPE_TEMPLATE
.
render
(
indent
=
" "
);
print
(
output
)
\ No newline at end of file
python/ait_impl/generation/xx.cpp
deleted
100644 → 0
View file @
3af15212
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
typename
ck
::
half_t
,
typename
ck
::
half_t
,
typename
ck
::
half_t
,
typename
float
,
typename
ck
::
tensor_layout
::
gemm
::
ColMajor
,
typename
ck
::
tensor_layout
::
gemm
::
RowMajor
,
typename
ck
::
tensor_layout
::
gemm
::
RowMajor
,
typename
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
typename
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
typename
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
typename
S
<
8
,
2
>,
typename
S
<
8
,
2
>
,
typename
S
<
2
,
1
,
4
,
2
>
,
typename
S
<
8
,
1
,
32
,
1
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
1
,
1
,
4
,
1
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
1
,
1
,
4
,
2
>
,
typename
S
<
2
,
1
,
4
,
2
>
,
typename
S
<
8
,
1
,
32
,
1
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
1
,
1
,
4
,
1
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
1
,
1
,
4
,
2
>
,
typename
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
struct
DeviceGemmDl
:
public
DeviceGemm
<
ck
::
tensor_layout
::
gemm
::
ColMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
I4
=
Number
<
4
>
{};
static
constexpr
auto
I5
=
Number
<
5
>
{};
static
constexpr
auto
K1Number
=
Number
<
2
>
{};
// Build the [K0, M, K1] grid descriptor for matrix A.
// The generated code carried `if constexpr` branches comparing two *concrete*
// layout tags (including a reference to a `ColumnMajor` name inconsistent
// with the `ColMajor` used elsewhere) plus a constant-false
// `GemmSpecialization::Default == MNPadding` padding branch; had no branch
// matched, the descriptor lambda would fall off the end. Only the live path
// for this fully-specialized op (A column-major, no padding) is kept.
static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
{
    // K must split evenly into (K0, K1) with K1 == 2.
    assert(K % 2 == 0);

    const index_t K0 = K / 2;

    // A is column-major: element (m, k) lives at m + k * StrideA.
    const auto a_grid_desc_m_k =
        make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));

    // Default specialization (no M padding): unmerge K into (K0, K1) at output
    // dims {0, 2}; pass M through to output dim {1}.
    return transform_tensor_descriptor(
        a_grid_desc_m_k,
        make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
                   make_pass_through_transform(M)),
        make_tuple(Sequence<1>{}, Sequence<0>{}),
        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
// Build the [K0, N, K1] grid descriptor for matrix B.
// The generated code contained an unsubstituted `${layout_B}` template
// placeholder inside a dead `if constexpr` branch (ill-formed C++) and a
// constant-false MNPadding branch. Only the live path for this
// fully-specialized op (B row-major, no padding) is kept.
static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
{
    // K must split evenly into (K0, K1) with K1 == 2.
    assert(K % 2 == 0);

    const index_t K0 = K / 2;

    // B is row-major: element (k, n) lives at k * StrideB + n.
    const auto b_grid_desc_k_n =
        make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));

    // Default specialization (no N padding): unmerge K into (K0, K1) at output
    // dims {0, 2}; pass N through to output dim {1}.
    return transform_tensor_descriptor(
        b_grid_desc_k_n,
        make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
                   make_pass_through_transform(N)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
// Build the [M, N] grid descriptor for the output matrix C.
// As with the A/B builders, the generated code carried dead `if constexpr`
// branches over two concrete layout tags (including the inconsistent
// `ColumnMajor` spelling) and a constant-false MNPadding branch. Only the
// live path (C row-major, no padding) is kept.
static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
{
    // C is row-major: element (m, n) lives at m * StrideC + n.
    const auto c_grid_desc_m_n =
        make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));

    // Default specialization: no padding — pass both dims through unchanged.
    return transform_tensor_descriptor(
        c_grid_desc_m_n,
        make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));
}
// Descriptor types, instantiated with dummy extents just to capture the type.
using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));

// GridwiseGemm
// NOTE: the generated code passed an undefined identifier `BlockSize` as the
// first argument — the template's BlockSize parameter was never substituted.
// The concrete value is 256, matching the dim3(256) kernel launches and the
// "DeviceGemmDl<256, ...>" string in GetTypeString().
using GridwiseGemm =
    GridwiseGemmDl_km_kn_mn_v1r3<256,        // BlockSize
                                 ck::half_t, // A/B data type
                                 float,      // accumulation type
                                 ck::half_t, // C data type
                                 InMemoryDataOperationEnum::Set,
                                 AGridDesc_K0_M_K1,
                                 BGridDesc_K0_N_K1,
                                 CGridDesc_M_N,
                                 128, // MPerBlock
                                 128, // NPerBlock
                                 16,  // K0PerBlock
                                 2,   // K1
                                 4,   // M1PerThread
                                 4,   // N1PerThread
                                 1,   // KPerThread
                                 // A/B thread-cluster and transfer configs
                                 // (copied verbatim from the generated instance).
                                 S<8, 2>,
                                 S<8, 2>,
                                 S<2, 1, 4, 2>,
                                 S<8, 1, 32, 1>,
                                 S<0, 3, 1, 2>,
                                 S<0, 3, 1, 2>,
                                 S<1, 1, 4, 1>,
                                 S<0, 3, 1, 2>,
                                 S<1, 1, 4, 2>,
                                 S<2, 1, 4, 2>,
                                 S<8, 1, 32, 1>,
                                 S<0, 3, 1, 2>,
                                 S<0, 3, 1, 2>,
                                 S<1, 1, 4, 1>,
                                 S<0, 3, 1, 2>,
                                 S<1, 1, 4, 2>,
                                 S<0, 1, 2, 3, 4, 5>,
                                 5,
                                 4>;

// Blocked/tiled descriptor types derived by the gridwise kernel.
using AGridDesc_K0_M0_M1_K1 =
    decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{}));
using BGridDesc_K0_N0_N1_K1 =
    decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{}));
using CGridDesc_M0_M10_M11_N0_N10_N11 =
    decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{}));
using DefaultBlock2CTileMap =
    decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(CGridDesc_M_N{}));
// Argument
// Wraps one GEMM problem instance: raw device pointers plus the grid
// descriptors derived from (M, N, K, strides). Descriptors are built in the
// constructor body (not the initializer list) because the tiled forms depend
// on the untiled ones.
struct Argument : public BaseArgument
{
    Argument(const ck::half_t* p_a_grid,
             const ck::half_t* p_b_grid,
             ck::half_t* p_c_grid,
             index_t M,
             index_t N,
             index_t K,
             index_t StrideA,
             index_t StrideB,
             index_t StrideC,
             index_t M01,
             index_t N01,
             ck::tensor_operation::element_wise::PassThrough a_element_op,
             ck::tensor_operation::element_wise::PassThrough b_element_op,
             ck::tensor_operation::element_wise::PassThrough c_element_op)
        : p_a_grid_{p_a_grid},
          p_b_grid_{p_b_grid},
          p_c_grid_{p_c_grid},
          a_grid_desc_k0_m0_m1_k1_{},
          b_grid_desc_k0_n0_n1_k1_{},
          c_grid_desc_m0_m10_m11_n0_n10_n11_{},
          block_2_ctile_map_{},
          M01_{M01},
          N01_{N01},
          a_element_op_{a_element_op},
          b_element_op_{b_element_op},
          c_element_op_{c_element_op}
    {
        // Untiled descriptors straight from the problem sizes.
        a_grid_desc_k0_m_k1_ = DeviceGemmDl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
        b_grid_desc_k0_n_k1_ = DeviceGemmDl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
        c_grid_desc_m_n_     = DeviceGemmDl::MakeCGridDescriptor_M_N(M, N, StrideC);

        // Only build the blocked/tiled descriptors when the problem is valid
        // for this kernel; otherwise they stay default-constructed (and
        // IsSupportedArgument / Run will reject the argument).
        if(GridwiseGemm::CheckValidity(
               a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, c_grid_desc_m_n_))
        {
            a_grid_desc_k0_m0_m1_k1_ =
                GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(a_grid_desc_k0_m_k1_);

            b_grid_desc_k0_n0_n1_k1_ =
                GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(b_grid_desc_k0_n_k1_);

            c_grid_desc_m0_m10_m11_n0_n10_n11_ =
                GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(c_grid_desc_m_n_);

            block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_);
        }
    }

    //  private:
    // Device pointers to the A/B inputs and C output.
    const ck::half_t* p_a_grid_;
    const ck::half_t* p_b_grid_;
    ck::half_t* p_c_grid_;

    // Untiled descriptors (always valid after construction).
    AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
    BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
    CGridDesc_M_N c_grid_desc_m_n_;

    // Tiled descriptors (only populated when CheckValidity passed).
    AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1_;
    BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1_;
    CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11_;
    DefaultBlock2CTileMap block_2_ctile_map_;

    // TODO: unused, but may be useful in future.
    index_t M01_;
    index_t N01_;

    // TODO: unused since gridwise_gemm_dl_v1r3 does NOT support prologue for the time being.
    ck::tensor_operation::element_wise::PassThrough a_element_op_;
    ck::tensor_operation::element_wise::PassThrough b_element_op_;
    ck::tensor_operation::element_wise::PassThrough c_element_op_;
};
// Invoker
struct
Invoker
:
public
BaseInvoker
{
using
Argument
=
DeviceGemmDl
::
Argument
;
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
{
std
::
cout
<<
"arg.a_grid_desc_k0_m0_m1_k1_{"
<<
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I0
)
<<
", "
<<
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I1
)
<<
", "
<<
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I2
)
<<
"}"
<<
std
::
endl
;
std
::
cout
<<
"arg.b_grid_desc_k0_n0_n1_k1_{"
<<
arg
.
b_grid_desc_k0_n_k1_
.
GetLength
(
I0
)
<<
", "
<<
arg
.
b_grid_desc_k0_n_k1_
.
GetLength
(
I1
)
<<
", "
<<
arg
.
b_grid_desc_k0_n_k1_
.
GetLength
(
I2
)
<<
"}"
<<
std
::
endl
;
std
::
cout
<<
"arg.c_grid_desc_m_n_{ "
<<
arg
.
c_grid_desc_m_n_
.
GetLength
(
I0
)
<<
", "
<<
arg
.
c_grid_desc_m_n_
.
GetLength
(
I1
)
<<
"}"
<<
std
::
endl
;
}
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
c_grid_desc_m_n_
))
{
throw
std
::
runtime_error
(
"wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdl_v2r3 has invalid setting"
);
}
const
index_t
grid_size
=
GridwiseGemm
::
CalculateGridSize
(
arg
.
c_grid_desc_m_n_
.
GetLength
(
I0
),
arg
.
c_grid_desc_m_n_
.
GetLength
(
I1
));
const
auto
K0
=
arg
.
a_grid_desc_k0_m0_m1_k1_
.
GetLength
(
I0
);
const
bool
has_main_k_block_loop
=
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
K0
);
const
bool
has_double_tail_k_block_loop
=
GridwiseGemm
::
CalculateHasDoubleTailKBlockLoop
(
K0
);
float
ave_time
=
0
;
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dl_v1r3
<
GridwiseGemm
,
ck
::
half_t
,
ck
::
half_t
,
remove_reference_t
<
AGridDesc_K0_M0_M1_K1
>
,
remove_reference_t
<
BGridDesc_K0_N0_N1_K1
>
,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
true
,
true
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
256
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
a_grid_desc_k0_m0_m1_k1_
,
arg
.
b_grid_desc_k0_n0_n1_k1_
,
arg
.
c_grid_desc_m0_m10_m11_n0_n10_n11_
,
arg
.
block_2_ctile_map_
);
}
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dl_v1r3
<
GridwiseGemm
,
ck
::
half_t
,
ck
::
half_t
,
remove_reference_t
<
AGridDesc_K0_M0_M1_K1
>
,
remove_reference_t
<
BGridDesc_K0_N0_N1_K1
>
,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
true
,
false
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
256
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
a_grid_desc_k0_m0_m1_k1_
,
arg
.
b_grid_desc_k0_n0_n1_k1_
,
arg
.
c_grid_desc_m0_m10_m11_n0_n10_n11_
,
arg
.
block_2_ctile_map_
);
}
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dl_v1r3
<
GridwiseGemm
,
ck
::
half_t
,
ck
::
half_t
,
remove_reference_t
<
AGridDesc_K0_M0_M1_K1
>
,
remove_reference_t
<
BGridDesc_K0_N0_N1_K1
>
,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
false
,
true
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
256
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
a_grid_desc_k0_m0_m1_k1_
,
arg
.
b_grid_desc_k0_n0_n1_k1_
,
arg
.
c_grid_desc_m0_m10_m11_n0_n10_n11_
,
arg
.
block_2_ctile_map_
);
}
else
{
const
auto
kernel
=
kernel_gemm_dl_v1r3
<
GridwiseGemm
,
ck
::
half_t
,
ck
::
half_t
,
remove_reference_t
<
AGridDesc_K0_M0_M1_K1
>
,
remove_reference_t
<
BGridDesc_K0_N0_N1_K1
>
,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
false
,
false
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
256
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
a_grid_desc_k0_m0_m1_k1_
,
arg
.
b_grid_desc_k0_n0_n1_k1_
,
arg
.
c_grid_desc_m0_m10_m11_n0_n10_n11_
,
arg
.
block_2_ctile_map_
);
}
return
ave_time
;
}
// polymorphic
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
static
constexpr
bool
IsValidCompilationParameter
()
{
// TODO: properly implement this check
return
true
;
}
// Returns true when this DL GEMM instance can run the given problem:
// the device must be one of the supported targets and the descriptors
// must satisfy the gridwise kernel's validity constraints.
static bool IsSupportedArgument(const Argument& arg)
{
    const auto device = ck::get_device_name();

    // DL kernels are only enabled for these targets.
    if(device != "gfx906" && device != "gfx1030")
    {
        return false;
    }

    return GridwiseGemm::CheckValidity(
        arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
}
// polymorphic
// Type-erased overload used through the base-operator interface.
// NOTE(review): the dynamic_cast result is dereferenced without a null
// check — passing a BaseArgument that is not a DeviceGemmDl::Argument
// would be undefined behavior; presumably callers always pass the
// matching Argument type — confirm.
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
    return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
// Build a concrete Argument from typed device pointers and problem sizes.
// M01/N01 are fixed to 1 (block-to-tile mapping tuning; unused for now —
// see the TODO on Argument::M01_/N01_).
static auto MakeArgument(const ck::half_t* p_a,
                         const ck::half_t* p_b,
                         ck::half_t* p_c,
                         index_t M,
                         index_t N,
                         index_t K,
                         index_t StrideA,
                         index_t StrideB,
                         index_t StrideC,
                         ck::tensor_operation::element_wise::PassThrough a_element_op,
                         ck::tensor_operation::element_wise::PassThrough b_element_op,
                         ck::tensor_operation::element_wise::PassThrough c_element_op)
{
    return Argument{p_a,
                    p_b,
                    p_c,
                    M,
                    N,
                    K,
                    StrideA,
                    StrideB,
                    StrideC,
                    1,
                    1,
                    a_element_op,
                    b_element_op,
                    c_element_op};
}
// Build a value-typed Invoker for this op.
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
// Type-erased factory: casts the void pointers to the op's concrete f16
// element type and heap-allocates an Argument (M01/N01 fixed to 1, as in
// MakeArgument).
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
                    const void* p_b,
                    void* p_c,
                    index_t M,
                    index_t N,
                    index_t K,
                    index_t StrideA,
                    index_t StrideB,
                    index_t StrideC,
                    ck::tensor_operation::element_wise::PassThrough a_element_op,
                    ck::tensor_operation::element_wise::PassThrough b_element_op,
                    ck::tensor_operation::element_wise::PassThrough c_element_op) override
{
    return std::make_unique<Argument>(static_cast<const ck::half_t*>(p_a),
                                      static_cast<const ck::half_t*>(p_b),
                                      static_cast<ck::half_t*>(p_c),
                                      M,
                                      N,
                                      K,
                                      StrideA,
                                      StrideB,
                                      StrideC,
                                      1,
                                      1,
                                      a_element_op,
                                      b_element_op,
                                      c_element_op);
}
// polymorphic
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
// polymorphic
// Human-readable identifier for this instance: the struct name followed by
// its baked-in tuning parameters.
std::string GetTypeString() const override
{
    std::ostringstream oss;

    // clang-format off
    oss << "DeviceGemmDl"
        << "<"
        << 256 << ", "
        << 128 << ", "
        << 128 << ", "
        << 16  << ", "
        << 2   << ", "
        << 4   << ", "
        << 4   << ", "
        << 1
        << ">";
    // clang-format on

    return oss.str();
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment