Commit 47cc9b7e authored by Astha Rai

added compilation of a shared library and multiple GEMM instances; cleaned up code design

parent adbefd90
# CK Python API
This API uses Python to generate instances of operations present in CK and compile them either into an executable that runs a single instance or into a shared library containing multiple instances.
There are two directories: `normal` and `shared`.
## Normal
Generates a single instance and compiles it into an executable that can be run.
## Shared
Generates multiple instances and compiles them into a shared library.
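Both flavours are driven the same way from Python. As a rough illustration (the module name `gemm_emit` below is an assumption, since the scripts' file names are not shown in this commit, and the include paths baked into the templates will likely need adjusting for your ROCm / composable_kernel checkout):

```python
# Minimal sketch, assuming the generator script is importable as gemm_emit.
# EmitGemmInstance.emit() renders the C++ source (ex.cpp or xx.cpp) from the
# built-in template and writes it out; the executable flavour also emits a
# Makefile and invokes `make`.
from gemm_emit import EmitGemmInstance

EmitGemmInstance().emit()
```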
gemm: xx.o
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
xx.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c xx.cpp
CC = {{cc}}
CFLAGS = {{CFLAGS}}
fPIC_flag = {{fPIC}}
obj_files = {{obj_files}}
%.obj : %.{{cpp}}
{{cfile_cmd}}
%.obj : %.bin
{{bfile_cmd}}
.PHONY: all clean clean_constants
all: {{target}}
{{target}}: $(obj_files)
$(CC) -shared $(fPIC_flag) $(CFLAGS) -o $@ $(obj_files)
clean:
rm -f *.obj {{target}} test.so
clean_constants:
rm -f constants.bin
import os
import subprocess
import re
def SubstituteTemplate(template, values):
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
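# Illustrative usage note (added for clarity, not part of the generator logic):
# SubstituteTemplate performs ${key}-style substitution, e.g.
#   SubstituteTemplate("using A = ${type_a};", {"type_a": "ck::half_t"})
# returns "using A = ck::half_t;". Placeholders whose keys are missing from
# `values` are left in the text unchanged.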
class EmitGemmInstance:
def __init__(self):
self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""
def emit(self):
values = {
'type_a' : 'ck::half_t',
'type_b' : 'ck::half_t',
'type_c' : 'ck::half_t',
'type_acc' : 'float',
'layout_a' : 'ck::tensor_layout::gemm::ColumnMajor',
'layout_b' : 'ck::tensor_layout::gemm::RowMajor',
'layout_c' : 'ck::tensor_layout::gemm::RowMajor',
'elementwise_op_a' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_b' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_c' : 'ck::tensor_operation::element_wise::PassThrough',
'Gemm_spec' : 'ck::tensor_operation::device::GemmSpecialization::Default',
'block_size' : '256',
'mperblock' : '128',
'nperblock' : '128',
'k0perblock' : '16',
'k1' : '2',
'm1perthread' : '4',
'n1perthread' : '4',
'kperthread' : '1',
'm1n1_thcluster_m1xs' : 'S<8, 2>',
'm1n1_thcluster_n1xs' : 'S<8, 2>',
'ABT_thread_slice_lengths_K0_M0_M1_K1' : 'S<2, 1, 4, 2>',
'ABT_thread_cluster_lengths_K0_M0_M1_K1' : 'S<8, 1, 32, 1>',
'ABT_thread_cluster_arrange_order' : 'S<0, 3, 1, 2>',
'ABT_src_access_order' : 'S<0, 3, 1, 2>',
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1' : 'S<1, 1, 4, 1>',
'ABT_src_vec_tensor_cont_dim_order' : 'S<0, 3, 1, 2>',
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1' : 'S<1, 1, 4, 2>',
'BBT_thread_slice_lengths_K0_N0_N1_K1' : 'S<2, 1, 4, 2>',
'BBT_thread_cluster_lengths_K0_N0_N1_K1' : 'S<8, 1, 32, 1>',
'BBT_thread_cluster_arrange_order' : 'S<0, 3, 1, 2>',
'BBT_src_access_order' : 'S<0, 3, 1, 2>',
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1' : 'S<1, 1, 4, 1>',
'BBT_src_vec_tensor_cont_dim_order' : 'S<0, 3, 1, 2>',
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 2>',
'CTT_src_dst_access_order' : 'S<0, 1, 2, 3, 4, 5>',
'CTT_src_dst_vec_dim' : '5',
'CTT_dst_scalar_per_vector' : '4'
}
# render the C++ source and the Makefile once, then write them out
source = SubstituteTemplate(self.gemm_devop_template, values)
print(source)
cf = open("ex.cpp", 'w')
cf.write(source)
cf.close()
makefile = SubstituteTemplate(self.make_template, values)
print(makefile)
cf = open("Makefile", 'w')
cf.write(makefile)
cf.close()
# build the generated example with the emitted Makefile
proc = subprocess.Popen(
["make"],
shell=True,
env=os.environ.copy(),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
out, err = proc.communicate()
a = EmitGemmInstance()
a.emit()
import re
def SubstituteTemplate(template, values):
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
class EmitGemmInstance:
def __init__(self):
self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""
def emit(self):
values = {
'type_a' : 'ck::half_t',
}
source = SubstituteTemplate(self.gemm_devop_template, values)
print(source)
cf = open("xx.cpp", 'w')
cf.write(source)
cf.close()
a = EmitGemmInstance()
a.emit()
CC = /opt/rocm/bin/hipcc
CK_PATH=/dockerx/composable_kernel/
CFLAGS = -O3 -std=c++17 -DCK_AMD_GPU_GFX90A --offload-arch=gfx90a -I"${CK_PATH}/include" -I"${CK_PATH}/library/include" -I"${CK_PATH}/profiler/include"
OBJS = ex.o host_tensor.o device_memory.o
all: $(OBJS)
$(CC) $(CFLAGS) $(OBJS) -o ex
device_memory.o: ../../library/src/utility/device_memory.cpp
$(CC) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp
host_tensor.o: ../../library/src/utility/host_tensor.cpp
$(CC) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
# explicit rule for ex.o (assumes the generated source is named ex.cpp)
ex.o: ex.cpp
$(CC) $(CFLAGS) -c ex.cpp
import re
def SubstituteTemplate(template, values):
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
class EmitGemmInstance:
def __init__(self):
self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""
def emit(self):
values = {
'type_a' : 'ck::half_t',
}
source = SubstituteTemplate(self.gemm_devop_template, values)
print(source)
cf = open("xx.cpp", 'w')
cf.write(source)
cf.close()
a = EmitGemmInstance()
a.emit()
# take input for GEMM from the user and send it to the example template
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
// #include <half.hpp>
#include <random>
#include <rocrand/rocrand.h>
#include "logging.h"
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
using gemm_2_hhh_TTT_256_64_128_32_8_2_32_32_1_2_PT = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle<
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::MNKPadding,
1,
256, // block_size
64, // m_per_block
128, // n_per_block
32, // k_per_block
8, // ak1
2, // bk1
32, // m_per_xdl
32, // n_per_xdl
1, // m_xdl_per_wave
2, // n_xdl_per_wave
ck::Sequence<4,64,1>, // thread_cluster_length
ck::Sequence<1,0,2>, // thread_cluster_arrange_order
ck::Sequence<1,0,2>, // src_access_order
2, // src_vector_dim
8, // src_scalar_per_vector
8, // dst_scalar_per_vector
1, // add_extra_dim
ck::Sequence<8,32,1>, // thread_cluster_length
ck::Sequence<0,2,1>, // thread_cluster_arrange_order
ck::Sequence<0,2,1>, // src_access_order
1, // src_vector_dim
4, // src_scalar_per_vector
2, // dst_scalar_per_vector
0, // add_extra_dim
1, // m_xdl_per_wave
1, // n_xdl_per_wave
ck::Sequence<1,32,1,8>, // m_n_block_wave_per_xdl
8 // scalar_per_vector
>;
using fe7d3cbb34ba481ca532c1fecec7ec7cc5fbb35d0 = gemm_2_hhh_TTT_256_64_128_32_8_2_32_32_1_2_PT;
void gemm_rrr_3(
void * in_ptr,
void * weight_ptr,
void * out_ptr,
int64_t* a_dim0,
int64_t* a_dim1,
int64_t* b_dim0,
int64_t* b_dim1,
int64_t* c_dim0,
int64_t* c_dim1,
hipStream_t stream
) {
ck::index_t M = (*a_dim0);
ck::index_t N = (*b_dim1);
ck::index_t K = (*a_dim1);
int64_t offset_a = 0;
int64_t offset_b = 0;
int64_t offset_c = 0;
ck::index_t stride_a = *a_dim1;
ck::index_t stride_b = *b_dim1;
ck::index_t stride_c = *c_dim1;
if (M == 256 && N == 32 && K == 128) {
auto op = fe7d3cbb34ba481ca532c1fecec7ec7cc5fbb35d0{};
auto invoker = op.MakeInvoker();
auto argument = op.MakeArgument(
static_cast<ck::half_t *>(in_ptr) + offset_a,
static_cast<ck::half_t *>(weight_ptr) + offset_b,
static_cast<ck::half_t *>(out_ptr) + offset_c,
M,
N,
K,
stride_a,
stride_b,
stride_c,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}
);
if(!op.IsSupportedArgument(argument)) {
LOG(FATAL) << "wrong! " << op.GetTypeString() << " with the specified compilation parameters does not support this Gemm problem.";
}
invoker.Run(argument, StreamConfig{stream, false});
return;
}
LOG(FATAL) << "Unsupported workload for this gemm specialization.";
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
// #include <half.hpp>
#include <random>
#include <rocrand/rocrand.h>
#include "logging.h"
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
using gemm_2_hhh_TTT_256_64_128_32_8_2_32_32_1_2_PT = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle<
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::MNKPadding,
1,
256, // block_size
64, // m_per_block
128, // n_per_block
32, // k_per_block
8, // ak1
2, // bk1
32, // m_per_xdl
32, // n_per_xdl
1, // m_xdl_per_wave
2, // n_xdl_per_wave
ck::Sequence<4,64,1>, // thread_cluster_length
ck::Sequence<1,0,2>, // thread_cluster_arrange_order
ck::Sequence<1,0,2>, // src_access_order
2, // src_vector_dim
8, // src_scalar_per_vector
8, // dst_scalar_per_vector
1, // add_extra_dim
ck::Sequence<8,32,1>, // thread_cluster_length
ck::Sequence<0,2,1>, // thread_cluster_arrange_order
ck::Sequence<0,2,1>, // src_access_order
1, // src_vector_dim
4, // src_scalar_per_vector
2, // dst_scalar_per_vector
0, // add_extra_dim
1, // m_xdl_per_wave
1, // n_xdl_per_wave
ck::Sequence<1,32,1,8>, // m_n_block_wave_per_xdl
8 // scalar_per_vector
>;
using fe7d3cbb34ba481ca532c1fecec7ec7cc5fbb35d0 = gemm_2_hhh_TTT_256_64_128_32_8_2_32_32_1_2_PT;
void gemm_rrr_3(
void * in_ptr,
void * weight_ptr,
void * out_ptr,
int64_t* a_dim0,
int64_t* a_dim1,
int64_t* b_dim0,
int64_t* b_dim1,
int64_t* c_dim0,
int64_t* c_dim1,
hipStream_t stream
) {
ck::index_t M = (*a_dim0);
ck::index_t N = (*b_dim1);
ck::index_t K = (*a_dim1);
int64_t offset_a = 0;
int64_t offset_b = 0;
int64_t offset_c = 0;
ck::index_t stride_a = *a_dim1;
ck::index_t stride_b = *b_dim1;
ck::index_t stride_c = *c_dim1;
if (M == 256 && N == 32 && K == 128) {
auto op = fe7d3cbb34ba481ca532c1fecec7ec7cc5fbb35d0{};
auto invoker = op.MakeInvoker();
auto argument = op.MakeArgument(
static_cast<ck::half_t *>(in_ptr) + offset_a,
static_cast<ck::half_t *>(weight_ptr) + offset_b,
static_cast<ck::half_t *>(out_ptr) + offset_c,
M,
N,
K,
stride_a,
stride_b,
stride_c,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}
);
if(!op.IsSupportedArgument(argument)) {
LOG(FATAL) << "wrong! " << op.GetTypeString() << " with the specified compilation parameters does not support this Gemm problem.";
}
invoker.Run(argument, StreamConfig{stream, false});
return;
}
LOG(FATAL) << "Unsupported workload for this gemm specialization.";
}
size_t GLOBAL_WORKSPACE_SIZE = 0;
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <random>
#include <rocrand/rocrand.h>
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/reduction_operator.hpp"
#include "include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
using layernorm_rank2_256_1_256_1_8_1_8_1_8_1_8_8 = ck::tensor_operation::device::DeviceLayernormImpl<
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
2,
1,
256, // block_size
1, // m_cluster_size
256, // k_cluster_size
1, // m_slice_size
8, // k_slice_size
1, // in_src_dim
8, // in_src_size
1, // gamma_src_dim
8, // gamma_src_size
1, // beta_src_dim
8, // beta_src_size
8 // out_dst_size
>;
using DeviceInstance = layernorm_rank2_256_1_256_1_8_1_8_1_8_1_8_8;
void layernorm_0(void* input,
void* gamma,
void* beta,
void* output,
int64_t* in_0,
int64_t* in_1,
hipStream_t stream)
{
int M = *in_0;
int N = *in_1;
std::vector<ck::index_t> i_inStrides;
i_inStrides.push_back(N);
i_inStrides.push_back(1);
auto device_instance = DeviceInstance{};
auto argument_ptr = device_instance.MakeArgumentPointer(
{M, N},
i_inStrides,
std::vector<ck::index_t>{0, 1},
std::vector<ck::index_t>{0, 1},
i_inStrides,
{1},
1e-05,
static_cast<ck::half_t *>(input),
static_cast<ck::half_t *>(gamma),
static_cast<ck::half_t *>(beta),
static_cast<ck::half_t *>(output),
ck::tensor_operation::element_wise::PassThrough{}
);
if(!device_instance.IsSupportedArgument(argument_ptr.get()))
{
throw std::runtime_error(
"wrong! device_layernorm with the specified compilation parameters does "
"not support this Softmax problem");
};
std::string instance_name = device_instance.GetTypeString();
auto invoker_ptr = device_instance.MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{stream, false});
return;
}
struct ProfilerMemoryPool {
ProfilerMemoryPool() {
std::random_device rd;
gen = std::mt19937(rd());
uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
offsets.reserve(512);
strides.reserve(512);
copies.reserve(512);
ptrs.reserve(512);
// create the rocRAND generator used by AllocateGaussianTensor; the handle is
// otherwise uninitialized when rocrand_set_seed / rocrand_generate_normal run
rocrand_create_generator(&generator, ROCRAND_RNG_PSEUDO_DEFAULT);
}
~ProfilerMemoryPool() {
// release the rocRAND generator created in the constructor
rocrand_destroy_generator(generator);
for (size_t i = 0; i < ptrs.size(); i++) {
hipFree(ptrs[i]);
}
}
template <typename DType>
DType* AllocateGaussianTensor(int64_t size) {
size_t length = size * sizeof(DType);
DType *d_x;
hipMalloc(&d_x, length);
float mean = 0.0f;
float stddev = 1.0f;
uint64_t seed = uniform_dist(gen);
rocrand_set_seed(generator, seed);
rocrand_generate_normal(generator, reinterpret_cast<float*>(d_x), size, mean, stddev);
return d_x;
}
ck::half_t* AllocateHalfGaussianTensor(int64_t size) {
return reinterpret_cast<ck::half_t*>(
AllocateGaussianTensor<ck::half_t>(size));
}
int AllocateHalfTensor(int64_t size, int64_t copy) {
offsets.push_back(0);
strides.push_back(size);
copies.push_back(copy);
auto ptr = AllocateHalfGaussianTensor(size * copy);
ptrs.push_back(reinterpret_cast<void*>(ptr));
return ptrs.size() - 1;
}
ck::half_t* RequestHalfTensorByIdx(int idx) {
auto copy = copies.at(idx);
auto offset = offsets.at(idx);
auto stride = strides.at(idx);
ck::half_t* ptr = reinterpret_cast<ck::half_t*>(ptrs.at(idx));
ptr += offset;
offset += stride;
if (offset == copy * stride) {
offset = 0;
}
offsets[idx] = offset;
return ptr;
}
std::vector<int64_t> offsets;
std::vector<int64_t> strides;
std::vector<int64_t> copies;
std::vector<void*> ptrs;
std::mt19937 gen;
std::uniform_int_distribution<int64_t> uniform_dist;
rocrand_generator generator;
};
// hack for DeviceMem linking error
// TODO fix this by making CK a header-only lib
// <<< hack begin
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p) const
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p) const
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl() {
hipGetErrorString(hipEventCreate(&mStart));
hipGetErrorString(hipEventCreate(&mEnd));
}
~KernelTimerImpl() {
hipGetErrorString(hipEventDestroy(mStart));
hipGetErrorString(hipEventDestroy(mEnd));
}
void Start() {
hipGetErrorString(hipDeviceSynchronize());
hipGetErrorString(hipEventRecord(mStart, nullptr));
}
void End() {
hipGetErrorString(hipEventRecord(mEnd, nullptr));
hipGetErrorString(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const {
float time;
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
hipEvent_t mStart, mEnd;
};
// >>> hack end
int main(int argc, char** argv) {
const int64_t in_0 = std::stoi(argv[1]);
const int64_t in_1 = std::stoi(argv[2]);
auto memory_pool = std::make_unique<ProfilerMemoryPool>();
hipStream_t stream = nullptr;
int64_t ptr_sz = in_0 * in_1;
int64_t norm_dim = in_1;
// TODO: special pool size for 8M L2 cache
// need to tune it for other devices
int64_t mem_pool_sz = std::max(2, std::min(64, int((1 << 23) / ptr_sz)));
memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz); // in: index 0
memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz); // out: index 1
memory_pool->AllocateHalfTensor(norm_dim, mem_pool_sz); // gamma: index 2
memory_pool->AllocateHalfTensor(norm_dim, mem_pool_sz); // beta: index 3
// warmup
for(int i = 0; i < 3; ++i) {
layernorm_0(
(void *) memory_pool->RequestHalfTensorByIdx(0),
(void *) memory_pool->RequestHalfTensorByIdx(2),
(void *) memory_pool->RequestHalfTensorByIdx(3),
(void *) memory_pool->RequestHalfTensorByIdx(1),
const_cast<int64_t *>(&in_0),
const_cast<int64_t *>(&in_1),
stream
);
}
// run
KernelTimerImpl timer;
timer.Start();
for(int i = 0; i < 5; ++i) {
layernorm_0(
(void *) memory_pool->RequestHalfTensorByIdx(0),
(void *) memory_pool->RequestHalfTensorByIdx(2),
(void *) memory_pool->RequestHalfTensorByIdx(3),
(void *) memory_pool->RequestHalfTensorByIdx(1),
const_cast<int64_t *>(&in_0),
const_cast<int64_t *>(&in_1),
stream
);
}
timer.End();
std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;
}
#pragma once
#include "logging.h"
#include "device_functions-generated.h"
#include "model_interface.h"
#include "raii_wrapper.h"
#include "macros.h"
#include <algorithm>
#include <deque>
#include <string>
#include <unordered_map>
#include <math.h>
void gemm_rrr_3(
void *,
void *,
void *,
int64_t*,
int64_t*,
int64_t*,
int64_t*,
int64_t*,
int64_t*,
hipStream_t
);
#define CHECK_VECTOR_ACCESS(vector, idx) \
if (idx >= vector.size()) { \
throw std::out_of_range( \
"[__func__]: index out of range, " #vector ".size()=" + \
std::to_string(vector.size()) + ", got " + std::to_string(idx)); \
}
namespace ait {
namespace {
void DeviceCheckLastError(const char* file, int line) {
auto device_error = GetLastError();
if (device_error != GetDeviceSuccess()) {
std::string msg = std::string("Got error: ") + GetLastErrorString() +
" enum: " + std::to_string(device_error) +
" at " + file + ": " + std::to_string(line);
LOG(ERROR) << msg;
throw std::runtime_error(msg);
}
}
thread_local bool target_has_graph_mode = false;
} // namespace
// Model is the class that actually performs inference. It owns memory for
// intermediate tensors and dynamic dimensions. Constants are owned by
// the model's owning container object, and input/output memory is owned
// by the user.
// Once an inference run has started, it is not safe to re-use the Model
// until the run has finished!
class Model {
public:
Model(
size_t blob_size,
size_t workspace_size,
size_t num_inputs,
size_t num_outputs,
size_t num_unbound_constants,
uint8_t* constants,
AITemplateAllocator& allocator)
: blob_(RAII_DeviceMalloc(blob_size, allocator)),
workspace_(RAII_DeviceMalloc(workspace_size, allocator)),
params_(num_inputs + num_outputs + num_unbound_constants),
num_inputs_(num_inputs),
num_outputs_(num_outputs),
constants_(constants) {
dmlc::InitLogging("aitemplate"); // TODO(xxx): render network name
LOG(INFO) << "Init AITemplate Runtime.";
global_workspace_ = static_cast<uint8_t*>(workspace_.get()) + 0;
unique_workspace_ = static_cast<uint8_t*>(workspace_.get());
DEVICE_CHECK(GetDevice(&device_idx_))
DEVICE_CHECK(CreateEvent(&run_finished_));
#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
DEVICE_CHECK(cudaDeviceGetAttribute(
&max_smem_size_, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_idx_));
#endif
DEVICE_CHECK(GetDeviceProperties(&device_properties_, device_idx_));
DEVICE_CHECK(StreamCreate(&graph_capture_stream_, /*non_blocking=*/true));
InitConstants(constants_);
auto* blob_ptr = static_cast<uint8_t*>(blob_.get());
}
~Model() {
if (run_finished_ != nullptr) {
DestroyEvent(run_finished_);
}
if (graph_capture_stream_ != nullptr) {
StreamDestroy(graph_capture_stream_);
}
if (graph_exec_ != nullptr) {
GraphExecDestroy(graph_exec_);
}
}
Model(Model&& other) {
run_finished_ = other.run_finished_;
graph_exec_ = other.graph_exec_;
graph_capture_stream_ = other.graph_capture_stream_;
other.run_finished_ = nullptr;
other.graph_exec_ = nullptr;
other.graph_capture_stream_ = nullptr;
constants_ = other.constants_;
num_inputs_ = other.num_inputs_;
global_workspace_ = other.global_workspace_;
unique_workspace_ = other.unique_workspace_;
workspace_ = std::move(other.workspace_);
params_ = std::move(other.params_);
constant_name_to_ptr_ = std::move(other.constant_name_to_ptr_);
// Re-wire the pointers in the above 2 structures.
InitConstants(constants_);
}
Model& operator=(Model&&) = delete;
Model(const Model&) = delete;
Model& operator=(const Model&) = delete;
void SetUpInputsOutputs() {
input_0 = static_cast<decltype(input_0)>(params_[0].ptr);
if (input_0 == nullptr) {
throw std::runtime_error("Constant input_0 was not set! Set the value with set_constant.");
}
input_1 = static_cast<decltype(input_1)>(params_[1].ptr);
if (input_1 == nullptr) {
throw std::runtime_error("Constant input_1 was not set! Set the value with set_constant.");
}
output_0 = static_cast<decltype(output_0)>(params_[2].ptr);
if (output_0 == nullptr) {
throw std::runtime_error("Constant output_0 was not set! Set the value with set_constant.");
}
}
void DeviceToDeviceCopies(StreamType stream) {
}
void Run(StreamType stream, bool graph_mode) {
SetUpInputsOutputs();
if (target_has_graph_mode && graph_mode) {
RunAsGraph(stream);
} else {
RunImpl(stream);
}
DEVICE_CHECK(EventRecord(run_finished_, stream));
}
void RunImpl(StreamType stream) {
gemm_rrr_3(
input_0,
input_1,
output_0,
&input_0_dim_0,
&input_0_dim_1,
&input_1_dim_0,
&input_1_dim_1,
&input_0_dim_0,
&input_1_dim_1,
stream
);
DeviceCheckLastError(__FILE__, __LINE__);
DeviceToDeviceCopies(stream);
}
bool IsPending() {
auto query = QueryEvent(run_finished_);
if (query == GetDeviceNotReady()) {
return true;
}
if (query != GetDeviceSuccess()) {
LOG(WARNING) << "Pending model run did not finish successfully. Error: "
<< GetErrorString(query);
}
return false;
}
void WaitForCompletion() {
DEVICE_CHECK(EventSynchronize(run_finished_));
}
size_t NumInputs() const {
return num_inputs_;
}
size_t NumOutputs() const {
return num_outputs_;
}
void SetParam(const void* src, size_t param_idx) {
CHECK_VECTOR_ACCESS(params_, param_idx)
// const_cast is not ideal here, but it is unfortunately
// necessary:
// 1) We store outputs and inputs in the same vector,
// and outputs cannot be const.
// 2) Most of the codegen is not const-correct (most ops
// require non-const pointers). So even if we put const
// pointers into params, a const_cast would be required
// somewhere else.
params_[param_idx].ptr = const_cast<void*>(src);
}
void SetInput(const void* src, const AITemplateParamShape& shape, size_t idx) {
SetInputShape(shape, idx);
SetParam(src, idx);
}
void SetOutput(void* src, size_t idx) {
SetParam(src, idx + num_inputs_);
}
// Write the (possibly dynamic) output shape to the given pointer.
// Note that this should be called _after_ the shape inference in
// Run() is finished. output_shape_out should be able to store
// at least GetOutputMaximumShape(idx).size values.
void GetOutputShape(size_t idx, int64_t* output_shape_out) {
const auto param_idx = idx + num_inputs_;
CHECK_VECTOR_ACCESS(params_, param_idx);
const auto& shape_ptrs = params_[param_idx].shape_ptrs;
for (size_t i = 0; i < shape_ptrs.size(); ++i) {
output_shape_out[i] = shape_ptrs[i].GetValue();
}
}
void SetConstant(const char* name, const void* src) {
auto it = constant_name_to_ptr_.find(name);
if (it == constant_name_to_ptr_.end()) {
throw std::out_of_range(std::string("Could not find constant ") + name);
}
const void** ptr = it->second;
*ptr = src;
}
private:
void InitConstants(uint8_t* constants) {
params_[0].shape_ptrs = {ParamDim(256, 256, &input_0_dim_0), ParamDim(128, 128, &input_0_dim_1)};
params_[1].shape_ptrs = {ParamDim(128, 128, &input_1_dim_0), ParamDim(32, 32, &input_1_dim_1)};
params_[2].shape_ptrs = {ParamDim(256, 256, &input_0_dim_0), ParamDim(32, 32, &input_1_dim_1)};
}
void SetInputShape(const AITemplateParamShape& shape, size_t idx) {
auto& param = params_[idx];
if (shape.size != param.shape_ptrs.size()) {
throw std::runtime_error(
"[SetInputShape] Got wrong param shape for input " + std::to_string(idx) +
"; expected " + std::to_string(param.shape_ptrs.size()) + ", got " +
std::to_string(shape.size));
}
for (size_t i = 0; i < param.shape_ptrs.size(); ++i) {
param.shape_ptrs[i].SetValue(shape.shape_data[i]);
}
}
DeviceError EndCapture(GraphType* graph_ptr) {
auto err = StreamEndCapture(graph_capture_stream_, graph_ptr);
if (err != GetDeviceSuccess()) {
// If we can't take the stream out of capture mode, something is probably
// wrong with CUDA graph for this model (e.g. there might have been an
// illegal capture mode operation). Disable graph mode to avoid such issues
// in future iterations.
target_has_graph_mode = false;
LOG(WARNING) << "Graph capture failed to end. Disabling graph mode.";
return err;
}
return GetDeviceSuccess();
}
void RunAsGraph(StreamType stream) {
DEVICE_CHECK(StreamBeginCapture(graph_capture_stream_, /*global=*/false));
try {
RunImpl(graph_capture_stream_);
} catch (...) {
GraphType graph;
// No need to DEVICE_CHECK here, we want to see the original exception.
EndCapture(&graph);
if (graph != nullptr && GraphDestroy(graph) != GetDeviceSuccess()) {
LOG(WARNING) << "Graph destruction failed while handling exception! Memory will be leaked.";
}
throw;
}
// The following function ends the capture and creates a graph
// inside a unique_ptr that cleans it up when it goes out of scope.
// Note that it throws an exception if EndCapture fails.
auto graph = RAII_EndCaptureAndCreateGraph(
[this](GraphType* graph_ptr){ return EndCapture(graph_ptr); }
);
if (graph_exec_ == nullptr) {
DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
} else if (GraphExecUpdate(graph_exec_, graph.get()) != GetDeviceSuccess()) {
// Consume the last cuda error, which may affect the next GraphExecLaunch
// call.
GetLastError();
DEVICE_CHECK(GraphExecDestroy(graph_exec_));
DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
}
DEVICE_CHECK(GraphExecLaunch(graph_exec_, stream));
}
int device_idx_;
int max_smem_size_{0};
DevicePropertyType device_properties_;
// This event tracks when the inference is finished
// so that this Model may be reclaimed by its owning
// ModelContainer.
EventType run_finished_;
// A blob of memory used for storing intermediate tensors.
GPUPtr blob_;
// Memory for constants that were folded into the *.so. Unowned by Model,
// owned by ModelContainer.
// TODO: make this const. It can't be const right now because we derive
// tensor pointers from it, and no tensor pointers are const.
uint8_t* constants_;
size_t num_inputs_;
size_t num_outputs_;
// The workspace blob is used as scratch memory. See
// _generate_workspace in memory planning for more information.
GPUPtr workspace_;
uint8_t* global_workspace_{nullptr};
uint8_t* unique_workspace_{nullptr};
class ParamDim {
public:
ParamDim(int64_t lower_bound, int64_t upper_bound, int64_t* value) :
lower_bound_(lower_bound),
upper_bound_(upper_bound),
value_(value) {}
void SetValue(int64_t new_value) {
if (new_value < lower_bound_ || new_value > upper_bound_) {
throw std::out_of_range(
"[SetValue] Dimension got value out of bounds; expected value to be in [" +
std::to_string(lower_bound_) + ", " + std::to_string(upper_bound_) + "], but got " +
std::to_string(new_value)
);
}
*value_ = new_value;
}
int64_t GetValue() const {
return *value_;
}
private:
int64_t lower_bound_;
int64_t upper_bound_;
int64_t* value_;
};
struct ParamInfo {
void* ptr = nullptr;
// TODO add offset
const char* name;
std::vector<ParamDim> shape_ptrs;
};
// Contains info for all tensors marked as inputs
// or outputs. The first num_inputs elements are the inputs.
// Constants are not included.
std::vector<ParamInfo> params_;
GraphExecType graph_exec_ = nullptr;
StreamType graph_capture_stream_;
std::unordered_map<std::string, const void**> constant_name_to_ptr_;
void * input_0 {nullptr};
void * input_1 {nullptr};
void * output_0 {nullptr};
int64_t input_0_dim_0 { 256 };
int64_t input_0_dim_1 { 128 };
int64_t input_1_dim_0 { 128 };
int64_t input_1_dim_1 { 32 };
};
} // namespace ait