Commit f945f40a authored by Astha Rai

gemm example: compiles into normal executable and .so

parent 3af15212
gemm: xx.o
CFLAGS=-I ~/rocm/composable_kernel/include -I /opt/rocm-5.1.1/hip/include -I ~/rocm/composable_kernel/include/ -I ~/rocm/composable_kernel/include/ck/ -I ~/rocm/composable_kernel/include/ck/problem_transform/ -I ~/rocm/composable_kernel/include/ck/tensor/ -I ~/rocm/composable_kernel/include/ck/tensor_description/ -I ~/rocm/composable_kernel/include/ck/tensor_operation/ -I ~/rocm/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/rocm/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/rocm/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/rocm/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/rocm/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/rocm/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/rocm/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/rocm/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/rocm/composable_kernel/library/include/ck/library/host/ -I ~/rocm/composable_kernel/library/include/ck/library/host_tensor/ -I ~/rocm/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/rocm/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/rocm/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/rocm/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/rocm/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/rocm/composable_kernel/library/include/ck/library/tensor_op/ -I ~/rocm/composable_kernel/library/include/ck/library/utility/ -I ~/rocm/composable_kernel/profiler/include/
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
xx.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm/rocrand -lrocrand -x hip -c xx.cpp -o xx.o
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c xx.cpp
import jinja2
SHAPE_EVAL_TEMPLATE = jinja2.Template(
"""
int M = *in_{{ range(rank - 1)|join(' * *in_') }};
int N = *in_{{rank - 1}};
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto K1Number = Number<K1>{};
"""
)
output = SHAPE_EVAL_TEMPLATE.render(rank=2)
print(output)
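# For reference, rendering with rank=2 should produce (modulo indentation):
#   int M = *in_0;
#   int N = *in_1;
# followed by the Number<> constant definitions verbatim.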
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::Default,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
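// Tuning-parameter legend for the trailing DeviceGemmDl arguments (names follow
// the keys used by the Python emitter later in this commit):
//   256, 128, 128, 16, 2        -> block_size, mperblock, nperblock, k0perblock, k1
//   4, 4, 1                     -> m1perthread, n1perthread, kperthread
//   S<8, 2>, S<8, 2>            -> m1n1_thcluster_m1xs, m1n1_thcluster_n1xs
//   next seven S<...>           -> ABT_* (A block-transfer thread slice/cluster lengths,
//                                  arrange/access orders, src/dst vector lengths)
//   following seven S<...>      -> BBT_* (the same roles for the B block transfer)
//   S<0, 1, 2, 3, 4, 5>, 5, 4   -> CTT_src_dst_access_order, CTT_src_dst_vec_dim,
//                                  CTT_dst_scalar_per_vector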
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
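// RowMajor stores element (row, col) at offset row * stride + col;
// ColumnMajor stores it at row + col * stride, matching the stride pairs above.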
Tensor<ck::half_t> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ck::tensor_layout::gemm::ColumnMajor{}));
Tensor<ck::half_t> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ck::tensor_layout::gemm::RowMajor{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ck::half_t>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<ck::half_t>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ck::half_t>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<ck::half_t>{-1.f, 1.f}(b_k_n);
}
Tensor<ck::half_t> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<ck::half_t> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ck::half_t) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(ck::half_t) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(ck::half_t) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ck::tensor_operation::element_wise::PassThrough{};
auto b_element_op = ck::tensor_operation::element_wise::PassThrough{};
auto c_element_op = ck::tensor_operation::element_wise::PassThrough{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ck::half_t*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<ck::half_t*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<ck::half_t*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
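// ave_time is reported in milliseconds, so flop / 1e9 / ave_time below yields
// TFLOP/s and num_btype / 1e6 / ave_time yields GB/s.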
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ck::half_t) * M * K + sizeof(ck::half_t) * K * N + sizeof(ck::half_t) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
import enum
import os.path
import shutil
import functools
import operator
import collections
import subprocess
import re
def SubstituteTemplate(template, values):
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
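# Example (illustrative call, not part of the original script):
#   SubstituteTemplate("using T = ${type_a};", {"type_a": "ck::half_t"})
# returns "using T = ck::half_t;". The while-loop keeps substituting until no
# ${...} placeholder changes, so values may themselves contain placeholders.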
class EmitGemmInstance:
def __init__(self):
self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""
def emit(self):
values = {
'type_a' : 'ck::half_t',
'type_b' : 'ck::half_t',
'type_c' : 'ck::half_t',
'type_acc' : 'float',
'layout_a' : 'ck::tensor_layout::gemm::ColumnMajor',
'layout_b' : 'ck::tensor_layout::gemm::RowMajor',
'layout_c' : 'ck::tensor_layout::gemm::RowMajor',
'elementwise_op_a' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_b' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_c' : 'ck::tensor_operation::element_wise::PassThrough',
'Gemm_spec' : 'ck::tensor_operation::device::GemmSpecialization::Default',
'block_size' : '256',
'mperblock' : '128',
'nperblock' : '128',
'k0perblock' : '16',
'k1' : '2',
'm1perthread' : '4',
'n1perthread' : '4',
'kperthread' : '1',
'm1n1_thcluster_m1xs' : 'S<8, 2>',
'm1n1_thcluster_n1xs' : 'S<8, 2>',
'ABT_thread_slice_lengths_K0_M0_M1_K1' : 'S<2, 1, 4, 2>',
'ABT_thread_cluster_lengths_K0_M0_M1_K1' : 'S<8, 1, 32, 1>',
'ABT_thread_cluster_arrange_order' : 'S<0, 3, 1, 2>',
'ABT_src_access_order' : 'S<0, 3, 1, 2>',
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1' : 'S<1, 1, 4, 1>',
'ABT_src_vec_tensor_cont_dim_order' : 'S<0, 3, 1, 2>',
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1' : 'S<1, 1, 4, 2>',
'BBT_thread_slice_lengths_K0_N0_N1_K1' : 'S<2, 1, 4, 2>',
'BBT_thread_cluster_lengths_K0_N0_N1_K1' : 'S<8, 1, 32, 1>',
'BBT_thread_cluster_arrange_order' : 'S<0, 3, 1, 2>',
'BBT_src_access_order' : 'S<0, 3, 1, 2>',
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1' : 'S<1, 1, 4, 1>',
'BBT_src_vec_tensor_cont_dim_order' : 'S<0, 3, 1, 2>',
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 2>',
'CTT_src_dst_access_order' : 'S<0, 1, 2, 3, 4, 5>',
'CTT_src_dst_vec_dim' : '5',
'CTT_dst_scalar_per_vector' : '4'
}
template = self.gemm_devop_template
cf = open("ex.cpp", 'w')
print(SubstituteTemplate(template, values))
cf.write(SubstituteTemplate(template, values))
cf.close()
m_template = self.make_template
cf = open("Makefile", 'w')
print(SubstituteTemplate(m_template, values))
cf.write(SubstituteTemplate(m_template, values))
cf.close()
PIPE = -1
STDOUT = -2
proc = subprocess.Popen(
["make"],
shell=True,
env=os.environ.copy(),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
out, err = proc.communicate()
a = EmitGemmInstance()
a.emit()
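# Sketch (not part of the original script): emit() captures but discards the make
# output, so a follow-up step like this can surface build errors and, on success,
# run the freshly built binary. Assumes the Makefile above produced ./gemm.
build = subprocess.run(["make"], capture_output=True, text=True)
if build.returncode != 0:
    print(build.stderr)
else:
    result = subprocess.run(["./gemm"], capture_output=True, text=True)
    print(result.stdout)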
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
import enum
import os.path
import shutil
import functools
import operator
import collections
import subprocess
import re
def SubstituteTemplate(template, values):
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
class EmitGemmInstance:
def __init__(self):
self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""
def emit(self):
values = {
'type_a' : 'ck::half_t',
'type_b' : 'ck::half_t',
'type_c' : 'ck::half_t',
'type_acc' : 'float',
'layout_a' : 'ck::tensor_layout::gemm::ColumnMajor',
'layout_b' : 'ck::tensor_layout::gemm::RowMajor',
'layout_c' : 'ck::tensor_layout::gemm::RowMajor',
'elementwise_op_a' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_b' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_c' : 'ck::tensor_operation::element_wise::PassThrough',
'Gemm_spec' : 'ck::tensor_operation::device::GemmSpecialization::Default',
'block_size' : '256',
'mperblock' : '128',
'nperblock' : '128',
'k0perblock' : '16',
'k1' : '2',
'm1perthread' : '4',
'n1perthread' : '4',
'kperthread' : '1',
'm1n1_thcluster_m1xs' : 'S<8, 2>',
'm1n1_thcluster_n1xs' : 'S<8, 2>',
'ABT_thread_slice_lengths_K0_M0_M1_K1' : 'S<2, 1, 4, 2>',
'ABT_thread_cluster_lengths_K0_M0_M1_K1' : 'S<8, 1, 32, 1>',
'ABT_thread_cluster_arrange_order' : 'S<0, 3, 1, 2>',
'ABT_src_access_order' : 'S<0, 3, 1, 2>',
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1' : 'S<1, 1, 4, 1>',
'ABT_src_vec_tensor_cont_dim_order' : 'S<0, 3, 1, 2>',
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1' : 'S<1, 1, 4, 2>',
'BBT_thread_slice_lengths_K0_N0_N1_K1' : 'S<2, 1, 4, 2>',
'BBT_thread_cluster_lengths_K0_N0_N1_K1' : 'S<8, 1, 32, 1>',
'BBT_thread_cluster_arrange_order' : 'S<0, 3, 1, 2>',
'BBT_src_access_order' : 'S<0, 3, 1, 2>',
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1' : 'S<1, 1, 4, 1>',
'BBT_src_vec_tensor_cont_dim_order' : 'S<0, 3, 1, 2>',
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 2>',
'CTT_src_dst_access_order' : 'S<0, 1, 2, 3, 4, 5>',
'CTT_src_dst_vec_dim' : '5',
'CTT_dst_scalar_per_vector' : '4'
}
template = self.gemm_devop_template
cf = open("ex.cpp", 'w')
print(SubstituteTemplate(template, values))
cf.write(SubstituteTemplate(template, values))
cf.close()
m_template = self.make_template
cf = open("Makefile", 'w')
print(SubstituteTemplate(m_template, values))
cf.write(SubstituteTemplate(m_template, values))
cf.close()
PIPE = -1
STDOUT = -2
proc = subprocess.Popen(
["make"],
shell=True,
env=os.environ.copy(),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
out, err = proc.communicate()
a = EmitGemmInstance()
a.emit()
import enum
import os.path
import shutil
import functools
import operator
import collections
import re
def SubstituteTemplate(template, values):
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
class EmitGemmInstance:
def __init__(self):
self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""
def emit(self):
values = {
'type_a' : 'ck::half_t',
}
template = self.gemm_devop_template
cf = open("xx.cpp", 'w')
print(SubstituteTemplate(template, values))
cf.write(SubstituteTemplate(template, values))
cf.close()
a = EmitGemmInstance()
a.emit()
# TODO: take GEMM input from the user and feed it into the example template
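# Sketch (not part of the original script) of the TODO above: collect the data
# types from the command line and merge them into the values dict inside emit()
# before calling SubstituteTemplate. Flag names here are illustrative only.
import argparse

def parse_emit_args():
    parser = argparse.ArgumentParser(description="Emit a CK GEMM example from the template")
    parser.add_argument("--type-a", default="ck::half_t", help="ADataType for the emitted kernel")
    parser.add_argument("--type-b", default="ck::half_t", help="BDataType for the emitted kernel")
    parser.add_argument("--type-c", default="ck::half_t", help="CDataType for the emitted kernel")
    # argparse turns --type-a into the key 'type_a', matching the template placeholders
    return vars(parser.parse_args())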
CC = /opt/rocm/bin/hipcc
CK_PATH=/dockerx/composable_kernel/
CFLAGS = -O3 -std=c++17 -DCK_AMD_GPU_GFX90A --offload-arch=gfx90a -I"${CK_PATH}/include" -I"${CK_PATH}/library/include" -I"${CK_PATH}/profiler/include"
OBJS = ex.o host_tensor.o device_memory.o
all: $(OBJS)
$(CC) $(CFLAGS) $(OBJS) -o ex
device_memory.o: ../../library/src/utility/device_memory.cpp
$(CC) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp
host_tensor.o: ../../library/src/utility/host_tensor.cpp
$(CC) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
test.so: ex.o host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o test.so
device_memory.o: ../../../../../library/src/utility/device_memory.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
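# Sketch (not part of the original commit): a quick smoke test that the shared
# library produced by the Makefile above loads and its runtime dependencies
# resolve. Assumes test.so is in the current working directory.
import ctypes

lib = ctypes.CDLL("./test.so")
print("loaded:", lib)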
import enum
import os.path
import shutil
import functools
import operator
import collections
import subprocess
import re
def SubstituteTemplate(template, values):
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
class EmitGemmInstance:
def __init__(self):
self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""
def emit(self):
values = {
'type_a' : 'ck::half_t',
'type_b' : 'ck::half_t',
'type_c' : 'ck::half_t',
'type_acc' : 'float',
'layout_a' : 'ck::tensor_layout::gemm::ColumnMajor',
'layout_b' : 'ck::tensor_layout::gemm::RowMajor',
'layout_c' : 'ck::tensor_layout::gemm::RowMajor',
'elementwise_op_a' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_b' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_c' : 'ck::tensor_operation::element_wise::PassThrough',
'Gemm_spec' : 'ck::tensor_operation::device::GemmSpecialization::Default',
'block_size' : '256',
'mperblock' : '128',
'nperblock' : '128',
'k0perblock' : '16',
'k1' : '2',
'm1perthread' : '4',
'n1perthread' : '4',
'kperthread' : '1',
'm1n1_thcluster_m1xs' : 'S<8, 2>',
'm1n1_thcluster_n1xs' : 'S<8, 2>',
'ABT_thread_slice_lengths_K0_M0_M1_K1' : 'S<2, 1, 4, 2>',
'ABT_thread_cluster_lengths_K0_M0_M1_K1' : 'S<8, 1, 32, 1>',
'ABT_thread_cluster_arrange_order' : 'S<0, 3, 1, 2>',
'ABT_src_access_order' : 'S<0, 3, 1, 2>',
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1' : 'S<1, 1, 4, 1>',
'ABT_src_vec_tensor_cont_dim_order' : 'S<0, 3, 1, 2>',
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1' : 'S<1, 1, 4, 2>',
'BBT_thread_slice_lengths_K0_N0_N1_K1' : 'S<2, 1, 4, 2>',
'BBT_thread_cluster_lengths_K0_N0_N1_K1' : 'S<8, 1, 32, 1>',
'BBT_thread_cluster_arrange_order' : 'S<0, 3, 1, 2>',
'BBT_src_access_order' : 'S<0, 3, 1, 2>',
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1' : 'S<1, 1, 4, 1>',
'BBT_src_vec_tensor_cont_dim_order' : 'S<0, 3, 1, 2>',
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 2>',
'CTT_src_dst_access_order' : 'S<0, 1, 2, 3, 4, 5>',
'CTT_src_dst_vec_dim' : '5',
'CTT_dst_scalar_per_vector' : '4'
}
template = self.gemm_devop_template
cf = open("ex.cpp", 'w')
print(SubstituteTemplate(template, values))
cf.write(SubstituteTemplate(template, values))
cf.close()
m_template = self.make_template
cf = open("Makefile", 'w')
print(SubstituteTemplate(m_template, values))
cf.write(SubstituteTemplate(m_template, values))
cf.close()
PIPE = -1
STDOUT = -2
proc = subprocess.Popen(
["make"],
shell=True,
env=os.environ.copy(),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
out, err = proc.communicate()
a = EmitGemmInstance()
a.emit()
import enum
import os.path
import shutil
import functools
import operator
import collections
import re
def SubstituteTemplate(template, values):
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
class EmitGemmInstance:
def __init__(self):
self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
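// ave_time is in ms, so flop / 1e9 / ave_time is GFLOP/ms == TFLOP/s and
// num_btype / 1e6 / ave_time is MB/ms == GB/s.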
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""
def emit(self):
values = {
'type_a' : 'ck::half_t',
}
template = self.gemm_devop_template
# NOTE: this template currently contains no ${...} placeholders, so the substitution
# below is a no-op and xx.cpp is emitted verbatim.
source = SubstituteTemplate(template, values)
print(source)
cf = open("xx.cpp", 'w')
cf.write(source)
cf.close()
a = EmitGemmInstance()
a.emit()
# TODO: take GEMM problem sizes and tuning parameters as user input and substitute
# them into the example template (sketched below).
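# A minimal sketch (assumption, not part of the original): read problem sizes from the
# command line and merge them into the values dict, assuming hypothetical ${M}/${N}/${K}
# placeholders are added to the template:
#
#   import argparse
#   parser = argparse.ArgumentParser()
#   parser.add_argument("--m", default="3840")
#   parser.add_argument("--n", default="4096")
#   parser.add_argument("--k", default="4096")
#   args = parser.parse_args()
#   values.update({"M": args.m, "N": args.n, "K": args.k})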
\ No newline at end of file
import enum
import os.path
import shutil
import functools
import operator
import collections
import re
def SubstituteTemplate(template, values):
# Repeatedly replace ${key} placeholders with their values until the text stops
# changing, so values that themselves contain ${...} placeholders are expanded too.
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = r"\$\{%s\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
class EmitGemmInstance:
def __init__(self):
self.gemm_kernel_template = """
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_K0_M0_M1_K1,
typename BGridDesc_K0_N0_N1_K1,
typename CGridDesc_M0_M10_M11_N0_N10_N11,
typename Block2CTileMap,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dl_v1r3(const ${type_ab}* __restrict__ ${p_a_grid},
const ${type_ab}* __restrict__ ${p_b_grid},
${type_c}* __restrict__ ${p_c_grid},
const ${A_GridDesc_K0_M_K1} ${a_grid_desc_k0_m0_m1_k1},
const ${BGridDesc_K0_N_K1} ${b_grid_desc_k0_n0_n1_k1},
const ${CGridDesc_M0_M10_M11_N0_N10_N11} ${c_grid_desc_m0_m10_m11_n0_n10_n11},
const Block2CTileMap ${block_2_ctile_map})
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(${type_ab});
__shared__ ${type_ab} p_shared_block[shared_block_size];
GridwiseGemm::Run(${p_a_grid},
${p_b_grid},
${p_c_grid},
p_shared_block,
${a_grid_desc_k0_m0_m1_k1},
${b_grid_desc_k0_n0_n1_k1},
${c_grid_desc_m0_m10_m11_n0_n10_n11},
${block_2_ctile_map},
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
template <index_t BlockSize,
${type_ab},
${type_acc},
${type_c},
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
${A_GridDesc_K0_M_K1},
${BGridDesc_K0_N_K1},
${CGridDesc_M_N},
${mperblock},
${nperblock},
${k0perblock},
${k1value},
${M1PerThreadM111},
${N1PerThreadN111},
${KPerThread},
${M11N11ThreadClusterM110Xs},
${M11N11ThreadClusterN110Xs},
${ABlockTransferThreadSliceLengths_K0_M0_M1_K1},
${ABlockTransferThreadClusterLengths_K0_M0_M1_K1},
${ABlockTransferThreadClusterArrangeOrder},
${ABlockTransferSrcAccessOrder},
${ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1},
${ABlockTransferSrcVectorTensorContiguousDimOrder},
${ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1},
${BBlockTransferThreadSliceLengths_K0_N0_N1_K1},
${BBlockTransferThreadClusterLengths_K0_N0_N1_K1},
${BBlockTransferThreadClusterArrangeOrder},
${BBlockTransferSrcAccessOrder},
${BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1},
${BBlockTransferSrcVectorTensorContiguousDimOrder},
${BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1},
${CThreadTransferSrcDstAccessOrder},
${CThreadTransferSrcDstVectorDim},
${CThreadTransferDstScalarPerVector}>
struct GridwiseGemmDl_km_kn_mn_v1r3
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
// K1 should be Number<...>
static constexpr auto K1 = Number<K1Value>{};
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
// TODO: change this. I think it needs multi-dimensional alignment
constexpr auto max_lds_align = K1;
// TODO: check alignment
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_k_m = make_naive_tensor_descriptor_aligned(
make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
// TODO: check alignment
// B matrix in LDS memory, dst of blockwise copy
constexpr auto b_block_desc_k_n = make_naive_tensor_descriptor_aligned(
make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
// TODO: check alignment
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_aligned_space_size =
math::integer_least_multiple(a_block_desc_k_m.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_aligned_space_size =
math::integer_least_multiple(b_block_desc_k_n.GetElementSpaceSize(), max_lds_align);
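// The factor of 2 below presumably accounts for double-buffered (ping-pong) LDS
// storage of the A and B blocks.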
return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB);
}
"""
def emit(self):
values = {
'function_name': "gemm",
'type_a' : 'ck::half_t',
'type_b' : 'ck::half_t',
'type_c' : 'ck::half_t',
'type_acc' : 'float',
'layout_a' : 'ck::tensor_layout::gemm::RowMajor',
'layout_b' : 'ck::tensor_layout::gemm::RowMajor',
'layout_c' : 'ck::tensor_layout::gemm::RowMajor',
'elementwise_op_a' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_b' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_c' : 'ck::tensor_operation::element_wise::PassThrough',
'Gemm_spec' : 'ck::tensor_operation::device::GemmSpecialization::MNKPadding',
'block_size' : '256',
'mperblock' : '64',
'nperblock' : '128',
'kperblock' : '32',
'k1' : '8',
'mperxdl' : '32',
'nperxdl' : '32',
'mxdlperwave' : '1',
'nxdlperwave' : '2',
'threadclusterlength_a' : 'ck::Sequence<4,64,1>',
'threadclusterarrange_a' : 'ck::Sequence<1,0,2>',
'srcaccessorder_a' : 'ck::Sequence<1,0,2>',
'srcvectordim_a' : '2',
'srcscalarpervec_a' : '8',
'dstscalarpervec_a' : '8',
'add_extra_dim_a' : '1',
'threadclusterlength_b' : 'ck::Sequence<8,32,1>',
'threadclusterarrange_b' : 'ck::Sequence<0,2,1>',
'srcaccessorder_b' : 'ck::Sequence<0,2,1>',
'srcvectordim_b' : '1',
'srcscalarpervec_b' : '4',
'dstscalarpervec_b' : '2',
'add_extra_dim_b' : '0',
'dstscalarpervec_c' : '8'
}
# NOTE: the template attribute is named gemm_kernel_template; several of the keys
# above (e.g. layout_a, mperxdl) do not yet have matching ${...} placeholders in it,
# so they are currently ignored by the substitution.
template = self.gemm_kernel_template
print(SubstituteTemplate(template, values))
\ No newline at end of file
import os
import re
from hashlib import sha1
from typing import Any, Dict, OrderedDict
import jinja2
#from ...target import Target
#templating
FUNC_CALL_PARAM_TEMPLATE = jinja2.Template("(void *)({{name}})")
INSTANCE_TEMPLATE = jinja2.Template(
"""
using {{name}} = {{ config_name }};
"""
)
ARGS_PARSE_TEMPLATE = jinja2.Template(
"""
{% for idx in range(rank) %}
const int64_t in_{{idx}} = std::stoi(argv[{{ idx + 1 }}]);
{% endfor %}
"""
)
STRUCTS_DEF_TEMPLATE = jinja2.Template(
"""
struct ProfilerMemoryPool {
ProfilerMemoryPool() {
std::random_device rd;
gen = std::mt19937(rd());
uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
offsets.reserve(512);
strides.reserve(512);
copies.reserve(512);
ptrs.reserve(512);
}
~ProfilerMemoryPool() {
for(int i = 0; i < ptrs.size(); i++){
hipFree(ptrs[i]);
}
}
template <typename DType>
DType* AllocateGaussianTensor(int64_t size) {
size_t length = size * sizeof(DType);
DType *d_x;
hipMalloc(&d_x, length);
float mean = 0.0f;
float stddev = 1.0f;
uint64_t seed = uniform_dist(gen);
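// NOTE (assumption): 'generator' is declared below but never created with
// rocrand_create_generator(), and rocrand_generate_normal() writes 'size' floats,
// which overruns the buffer allocated above when DType is smaller than float.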
rocrand_set_seed(generator, seed);
rocrand_generate_normal(generator, reinterpret_cast<float*>(d_x), size, mean, stddev);
return d_x;
}
ck::half_t* AllocateHalfGaussianTensor(int64_t size) {
return reinterpret_cast<ck::half_t*>(
AllocateGaussianTensor<ck::half_t>(size));
}
int AllocateHalfTensor(int64_t size, int64_t copy) {
offsets.push_back(0);
strides.push_back(size);
copies.push_back(copy);
auto ptr = AllocateHalfGaussianTensor(size * copy);
ptrs.push_back(reinterpret_cast<void*>(ptr));
return ptrs.size() - 1;
}
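// Returns the current copy of tensor 'idx' and advances a round-robin offset,
// so repeated calls cycle through the 'copy' independent buffers allocated above.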
ck::half_t* RequestHalfTensorByIdx(int idx) {
auto copy = copies.at(idx);
auto offset = offsets.at(idx);
auto stride = strides.at(idx);
ck::half_t* ptr = reinterpret_cast<ck::half_t*>(ptrs.at(idx));
ptr += offset;
offset += stride;
if (offset == copy * stride) {
offset = 0;
}
offsets[idx] = offset;
return ptr;
}
std::vector<int64_t> offsets;
std::vector<int64_t> strides;
std::vector<int64_t> copies;
std::vector<void*> ptrs;
std::mt19937 gen;
std::uniform_int_distribution<int64_t> uniform_dist;
rocrand_generator generator;
};
// hack for DeviceMem linking error
// TODO fix this by making CK a header-only lib
// <<< hack begin
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p) const
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p) const
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl() {
hipGetErrorString(hipEventCreate(&mStart));
hipGetErrorString(hipEventCreate(&mEnd));
}
~KernelTimerImpl() {
hipGetErrorString(hipEventDestroy(mStart));
hipGetErrorString(hipEventDestroy(mEnd));
}
void Start() {
hipGetErrorString(hipDeviceSynchronize());
hipGetErrorString(hipEventRecord(mStart, nullptr));
}
void End() {
hipGetErrorString(hipEventRecord(mEnd, nullptr));
hipGetErrorString(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const {
float time;
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
hipEvent_t mStart, mEnd;
};
// >>> hack end
"""
)
FUNC_TEMPLATE = jinja2.Template(
"""
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <random>
#include <rocrand/rocrand.h>
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/reduction_operator.hpp"
{{extra_headers}}
{{extra_code}}
{{instances_decl}}
{{func_signature}}
{
{{shape_eval}}
{{exec_paths}}
}
"""
)
FUNC_CALL_TEMPLATE = jinja2.Template(
"""
{{indent}}{{func_name}}(
{{indent}} {{input}},
{{indent}} {{output}},
{% for name in input_dim_names %}
{{indent}} const_cast<int64_t *>(&{{name}}),
{% endfor %}
{{indent}} stream
{{indent}});
"""
)
PROFILER_TEMPLATE = jinja2.Template(
"""
size_t GLOBAL_WORKSPACE_SIZE = 0;
{{op_func}}
{{structs_def}}
int main(int argc, char** argv) {
{{args_parse}}
auto memory_pool = std::make_unique<ProfilerMemoryPool>();
hipStream_t stream = nullptr;
{{tensor_decl}}
// warmup
for(int i = 0; i < 3; ++i) {
{{func_call}}
}
// run
KernelTimerImpl timer;
timer.Start();
for(int i = 0; i < 5; ++i) {
{{func_call}}
}
timer.End();
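// NOTE: the elapsed time reported below covers all 5 timed iterations, so the
// per-call time is TIME / 5.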
std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;
}
"""
)
# rendering (messy, needs to be modularized and organized)
# gen_profiler() is kept commented out below for reference; for now rendering runs at
# module level, with the function's parameters inlined as the variables that follow.
# def gen_profiler(
# shape_eval_template: jinja2.Template,
# exec_template: jinja2.Template,
# tensor_decl_template: jinja2.Template,
# extra_header_template: jinja2.Template,
# get_func_signature: Any,
# extra_code: str = "",
# func_call_template: jinja2.Template = FUNC_CALL_TEMPLATE,
# indent: str = " ",
# ) -> str:
# shape_eval_template: jinja2.Template
# exec_template: jinja2.Template
# tensor_decl_template: jinja2.Template
# extra_header_template: jinja2.Template
get_func_signature: Any
extra_code: str = ""
func_call_template: jinja2.Template = FUNC_CALL_TEMPLATE
indent: str = " "
# shape_eval = shape_eval_template.render(rank=2) #if shape_eval_template else ""
# exe_path = exec_template.render(instance="DeviceInstance",dtype="void",reduce_dims=1,rank=2,eps=eps,)
instances = INSTANCE_TEMPLATE.render(
name="DeviceInstance", config_name= "ck::tensor_operation::device::DeviceLayernormImpl",)
op_func = FUNC_TEMPLATE.render(
instances_decl=instances,
#func_signature=get_func_signature(func_attrs),
#shape_eval=shape_eval,
#exec_paths=exe_path,
#extra_headers=extra_header_template.render(),
extra_code=extra_code,)
structs_def = STRUCTS_DEF_TEMPLATE.render()
args_parse = ARGS_PARSE_TEMPLATE.render(rank=2)
#tensor_decl = tensor_decl_template.render(rank=2)
input_dim_names = [f"in_{i}" for i in range(2)]
func_call = func_call_template.render(
func_name="norm",
input="(void *) memory_pool->RequestHalfTensorByIdx(0)",
gamma="(void *) memory_pool->RequestHalfTensorByIdx(2)",
beta="(void *) memory_pool->RequestHalfTensorByIdx(3)",
output="(void *) memory_pool->RequestHalfTensorByIdx(1)",
input_dim_names=input_dim_names,
indent=indent,
)
code = PROFILER_TEMPLATE.render(
op_func=op_func,
structs_def=structs_def,
args_parse=args_parse,
#tensor_decl=tensor_decl,
func_call=func_call,
)
# print(instances)
# print(args_parse)
# print(structs_def)
#print(func_call)
#print(op_func)
print(code)
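# Possible next step (assumption, mirroring the other emitters here): write the rendered
# profiler source to a file and build it, e.g.
#
#   with open("profiler_main.cpp", "w") as f:  # hypothetical file name
#       f.write(code)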
import jinja2
EXTRA_SHAPE_TEMPLATE = jinja2.Template(
"""
{{indent}}const int64_t stride_a = *a_dim1;
{{indent}}const int64_t stride_b = *b_dim1;
{{indent}}const int64_t stride_c = *c_dim1;
ck::index_t M0 = M / G1 / G2;
ck::index_t M1 = G1;
ck::index_t M2 = G2;
ck::index_t N0 = G3;
ck::index_t N1 = N / G3;
// GEMM shape
//ck::index_t M = M0 * M1 * M2;
//ck::index_t N = N0 * N1;
//ck::index_t K = 128;
//ck::index_t stride_A = K;
//ck::index_t stride_B = K;
// E = [M0, N0, M1, N1, M2]
/* 0, 3, 1, 4, 2
ck::index_t stride_E_M0 = N0 * M1 * N1 * M2;
ck::index_t stride_E_M1 = N1 * M2;
ck::index_t stride_E_M2 = 1;
ck::index_t stride_E_N0 = M1 * N1 * M2;
ck::index_t stride_E_N1 = M2;
*/
// E = [M2, M0, N0, M1, N1] 2, 0, 3, 1, 4
ck::index_t stride_E_M0 = N0* M1* N1;
ck::index_t stride_E_M1 = N1;
ck::index_t stride_E_M2 = M0* N0* M1* N1;
ck::index_t stride_E_N0 = M1 * N1;
ck::index_t stride_E_N1 = 1;
// D = [0, N0, 0, N1, 0]
ck::index_t stride_D_M0 = 0;
ck::index_t stride_D_M1 = 0;
ck::index_t stride_D_M2 = 0;
ck::index_t stride_D_N0 = N1;
ck::index_t stride_D_N1 = 1;
"""
)
output = EXTRA_SHAPE_TEMPLATE.render(indent=" ")
print(output)
\ No newline at end of file