Commit 5714d3c6 authored by Astha Rai

added preliminary version of instance generator for MIGraphX - generates a string of instances for the device_gemm_bilinear example
parent 475188ca
import enum
import ck_types
from copy import deepcopy
from dataclasses import dataclass
from enum import auto
from typing import List
import os.path
import shutil
import functools
import operator
import collections
import subprocess
import re
import gemm_op
from gemm_op import *
import user
from ck_types import *
from gemm_ex import *
from make_template import *
# holds multiple gemm instances
op_collection = user.CreateGemmOperator()
instances = []
x = EmitGemmInstance()
for op in op_collection:
    # object-file name encodes the tile: <block_size>_<m_per_block>_<n_per_block>_<k_per_block>_<k1>.o
    t = op.tile_desc
    instances.append(str(t.block_size) + "_" + str(t.m_per_block) + "_" + str(t.n_per_block) + "_" + str(t.k_per_block) + "_" + str(t.k1) + ".o ")
    x.emit(op)
m = EmitMake()
m.emit(instances)
#print(str(instances))
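# Illustrative usage sketch (not part of the original commit): with the sibling
# modules gemm_ex.py, make_template.py and user.py importable, running this
# driver is expected to write one <name>.cpp per instance plus a Makefile, and
# then invoke `make all` from EmitMake.emit(). The driver file name below is
# hypothetical:
#
#   $ python generator.py
#   $ ls
#   256_128_128_16_2.cpp 256_128_128_8_2.cpp ... Makefile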
import enum
import os.path
import shutil
import functools
import operator
import collections
import subprocess
import re
import gemm_op
from gemm_op import *
import user
def SubstituteTemplate(template, values):
    text = template
    changed = True
    # keep substituting until a full pass makes no change, so values that
    # themselves contain ${...} placeholders are also resolved
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            newtext = re.sub(regex, value, text)
            if newtext != text:
                changed = True
            text = newtext
    return text
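# A minimal usage sketch (illustrative, not part of the original commit):
# SubstituteTemplate replaces every ${key} placeholder with values[key].
#
#   >>> SubstituteTemplate("run_gemm_${name}", {"name": "256_128_128_8_2"})
#   'run_gemm_256_128_128_8_2'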
class EmitGemmInstance:
    def __init__(self):
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
    ${type_a},
    ${type_b},
    ${type_c},
    ${type_acc},
    ${layout_a},
    ${layout_b},
    ${layout_c},
    ${elementwise_op_a},
    ${elementwise_op_b},
    ${elementwise_op_c},
    ${Gemm_spec},
    ${block_size},
    ${mperblock},
    ${nperblock},
    ${k0perblock},
    ${k1},
    ${m1perthread},
    ${n1perthread},
    ${kperthread},
    ${m1n1_thcluster_m1xs},
    ${m1n1_thcluster_n1xs},
    ${ABT_thread_slice_lengths_K0_M0_M1_K1},
    ${ABT_thread_cluster_lengths_K0_M0_M1_K1},
    ${ABT_thread_cluster_arrange_order},
    ${ABT_src_access_order},
    ${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
    ${ABT_src_vec_tensor_cont_dim_order},
    ${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
    ${BBT_thread_slice_lengths_K0_N0_N1_K1},
    ${BBT_thread_cluster_lengths_K0_N0_N1_K1},
    ${BBT_thread_cluster_arrange_order},
    ${BBT_src_access_order},
    ${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
    ${BBT_src_vec_tensor_cont_dim_order},
    ${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
    ${CTT_src_dst_access_order},
    ${CTT_src_dst_vec_dim},
    ${CTT_dst_scalar_per_vector}>;

using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

bool run_gemm_${name}(const ProblemSize& problem_size, const ExecutionConfig& config)
{
    using namespace ck::literals;

    auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;

    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
            {
                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

    Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
    Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));

    switch(config.init_method)
    {
    case 0: break;
    case 1:
        ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
        ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
        break;
    default:
        ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
        ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
    }

    Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));

    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;

    DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());

    auto a_element_op = ${elementwise_op_a}{};
    auto b_element_op = ${elementwise_op_b}{};
    auto c_element_op = ${elementwise_op_c}{};

    // do GEMM
    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(
        static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
        static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
        static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
        M,
        N,
        K,
        StrideA,
        StrideB,
        StrideC,
        a_element_op,
        b_element_op,
        c_element_op);

    if(!gemm.IsSupportedArgument(argument))
    {
        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
        return true;
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});

    std::size_t flop = 2_uz * M * N * K;
    std::size_t num_btype =
        sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;

    if(config.do_verification)
    {
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

        auto ref_argument = ref_gemm.MakeArgument(
            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);

        ref_invoker.Run(ref_argument);

        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());

        return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
    }

    return true;
}

bool run_gemm_${name}(int argc, char* argv[])
{
    ProblemSize problem_size;
    ExecutionConfig config;
    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_${name}(problem_size, config);
}
"""
    def emit(self, operation):
        # file/function name encodes the tile: <block_size>_<m_per_block>_<n_per_block>_<k_per_block>_<k1>
        name = (str(operation.tile_desc.block_size) + "_" + str(operation.tile_desc.m_per_block)
                + "_" + str(operation.tile_desc.n_per_block) + "_" + str(operation.tile_desc.k_per_block)
                + "_" + str(operation.tile_desc.k1))
        values = {
            'name': name,
            'type_a': operation.A.element,
            'type_b': operation.B.element,
            'type_c': operation.C.element,
            'type_acc': 'float',
            'layout_a': operation.A.layout,
            'layout_b': operation.B.layout,
            'layout_c': operation.C.layout,
            'elementwise_op_a': operation.a_elem_op,
            'elementwise_op_b': operation.b_elem_op,
            'elementwise_op_c': operation.epilogue_functor,
            'Gemm_spec': operation.gemm_specialization,
            'block_size': str(operation.tile_desc.block_size),
            'mperblock': str(operation.tile_desc.m_per_block),
            'nperblock': str(operation.tile_desc.n_per_block),
            'k0perblock': str(operation.tile_desc.k_per_block),
            'k1': str(operation.tile_desc.k1),
            'm1perthread': str(operation.tile_desc.m_per_thread),
            'n1perthread': str(operation.tile_desc.n_per_thread),
            'kperthread': str(operation.tile_desc.k_per_thread),
            'm1n1_thcluster_m1xs': operation.tile_desc.m1n1_thcluster_m1xs,
            'm1n1_thcluster_n1xs': operation.tile_desc.m1n1_thcluster_n1xs,
            'ABT_thread_slice_lengths_K0_M0_M1_K1': operation.a_block_transfer.thread_slice_length,
            'ABT_thread_cluster_lengths_K0_M0_M1_K1': operation.a_block_transfer.thread_cluster_length,
            'ABT_thread_cluster_arrange_order': operation.a_block_transfer.thread_cluster_arrange_order,
            'ABT_src_access_order': operation.a_block_transfer.src_access_order,
            'ABT_src_vec_tensor_lengths_K0_M0_M1_K1': operation.a_block_transfer.src_vec_tensor_lengths,
            'ABT_src_vec_tensor_cont_dim_order': operation.a_block_transfer.src_vec_tensor_cont_dim_order,
            'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1': operation.a_block_transfer.dst_vec_tensor_lengths,
            'BBT_thread_slice_lengths_K0_N0_N1_K1': operation.b_block_transfer.thread_slice_length,
            'BBT_thread_cluster_lengths_K0_N0_N1_K1': operation.b_block_transfer.thread_cluster_length,
            'BBT_thread_cluster_arrange_order': operation.b_block_transfer.thread_cluster_arrange_order,
            'BBT_src_access_order': operation.b_block_transfer.src_access_order,
            'BBT_src_vec_tensor_lengths_K0_N0_N1_K1': operation.b_block_transfer.src_vec_tensor_lengths,
            'BBT_src_vec_tensor_cont_dim_order': operation.b_block_transfer.src_vec_tensor_cont_dim_order,
            'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': operation.b_block_transfer.dst_vec_tensor_lengths,
            'CTT_src_dst_access_order': operation.c_block_transfer.src_dst_access_order,
            'CTT_src_dst_vec_dim': str(operation.c_block_transfer.src_dst_vec_dim),
            'CTT_dst_scalar_per_vector': str(operation.c_block_transfer.dst_scalar_per_vector),
        }
        template = self.gemm_devop_template
        cf = open("%s.cpp" % name, 'w')
        cf.write(SubstituteTemplate(template, values))
        cf.close()
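# A minimal usage sketch (illustrative, not part of the original commit),
# assuming user.CreateGemmOperator() from this commit is importable:
#
#   import user
#   ops = user.CreateGemmOperator()
#   EmitGemmInstance().emit(ops[0])   # writes 256_128_128_16_2.cpp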
# take gemm input from the user and send it to the example template
# the structure for constructing this gemm op was taken from AIT's
# implementation of creating a gemm op
import enum
import ck_types
from copy import deepcopy
from dataclasses import dataclass
from enum import auto
from typing import List
from ck_types import *
class GemmType():
    GemmDefault = "ck::tensor_operation::device::GemmSpecialization::Default"
@dataclass
class TileDesc:
    block_size: int
    m_per_block: int
    n_per_block: int
    k_per_block: int
    k1: int
    m_per_thread: int
    n_per_thread: int
    k_per_thread: int
    m1n1_thcluster_m1xs: str
    m1n1_thcluster_n1xs: str

    def __str__(self) -> str:
        values = list(self.__dict__.values())
        return "_".join([str(x) for x in values])
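# Illustrative example (not part of the original commit): __str__ joins every
# field with "_", so a tile from user.py stringifies as
#
#   >>> str(TileDesc(256, 128, 128, 8, 2, 4, 4, 1, "S<8, 2>", "S<8, 2>"))
#   '256_128_128_8_2_4_4_1_S<8, 2>_S<8, 2>'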
@dataclass
class BlockTransferDesc:
    thread_slice_length: str
    thread_cluster_length: str
    thread_cluster_arrange_order: str
    src_access_order: str
    src_vec_tensor_lengths: str
    src_vec_tensor_cont_dim_order: str
    dst_vec_tensor_lengths: str

    def __str__(self) -> str:
        # fields already hold CK sequence strings such as "S<1, 1, 4, 2>",
        # so they can be joined directly (the AIT version converted int lists)
        return "_".join(str(v) for v in self.__dict__.values())
@dataclass
class CBlockTransferDesc:
    src_dst_access_order: str
    src_dst_vec_dim: int
    dst_scalar_per_vector: int

    def __str__(self) -> str:
        #args["m_n_block_wave_per_xdl"] = [str(x) for x in self.m_n_block_wave_per_xdl]
        return "_".join(str(v) for v in self.__dict__.values())
@dataclass
class GemmOperation:
    A: TensorDesc
    B: TensorDesc
    C: TensorDesc
    a_elem_op: TensorOperation
    b_elem_op: TensorOperation
    epilogue_functor: TensorOperation
    gemm_specialization: GemmType  # GemmSpecialization
    tile_desc: TileDesc
    a_block_transfer: BlockTransferDesc
    b_block_transfer: BlockTransferDesc
    b1_block_transfer: BlockTransferDesc = None
    c_block_transfer: CBlockTransferDesc = None

    def __str__(self) -> str:
        # gemm_specialization is a plain string (see GemmType), not an enum,
        # so it is used directly rather than via .value
        io_name = "{gemm_specialization}_{a_dtype}{b_dtype}{c_dtype}_{a_layout}{b_layout}{c_layout}".format(
            #gemm_kind=library.GemmKindNames[self.operation_kind],
            gemm_specialization=self.gemm_specialization,
            a_dtype=self.A.element,
            b_dtype=self.B.element,
            c_dtype=self.C.element,
            a_layout=self.A.layout,
            b_layout=self.B.layout,
            c_layout=self.C.layout,
        )
        extra_tile = ""
        if self.c_block_transfer is not None:
            # the field is named dst_scalar_per_vector in CBlockTransferDesc
            if self.c_block_transfer.dst_scalar_per_vector == 4:
                extra_tile = "_C4"
            elif self.c_block_transfer.dst_scalar_per_vector == 1:
                extra_tile = "_C1"
        tile_name = str(self.tile_desc) + extra_tile
        return "{io_name}_{tile_name}_{epilogue_functor}".format(
            io_name=io_name,
            tile_name=tile_name,
            epilogue_functor=self.epilogue_functor,
        )

    def accumulator_type(self):
        return DataType.f16  # TODO: should this be f32?
if __name__ == "__main__":
    # demo values for the DL-style descriptors, mirroring the first entry
    # produced by user.CreateGemmOperator()
    A = TensorDesc(DataType.f16, Layout.RowMajor)
    B = TensorDesc(DataType.f16, Layout.ColumnMajor)
    C = TensorDesc(DataType.f16, Layout.RowMajor)
    GemmOp = GemmOperation(
        A=A,
        B=B,
        C=C,
        a_elem_op=TensorOperation.PassThrough,
        b_elem_op=TensorOperation.PassThrough,
        epilogue_functor=TensorOperation.PassThrough,
        gemm_specialization=GemmType.GemmDefault,
        tile_desc=TileDesc(256, 128, 128, 16, 2, 4, 4, 1, "S<8, 2>", "S<8, 2>"),
        a_block_transfer=BlockTransferDesc(
            "S<2, 1, 4, 2>", "S<8, 1, 32, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>",
            "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"
        ),
        b_block_transfer=BlockTransferDesc(
            "S<2, 1, 4, 2>", "S<8, 1, 32, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>",
            "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"
        ),
        c_block_transfer=CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 4),
        #ds_dtype=[DataType.f16],
    )
    print(GemmOp.a_elem_op)
#include "run.h"
int main(int argc, char* argv[])
{
//return !run_gemm_example(argc, argv);
run_gemm_128_32_128_8_2(argc, argv);
run_gemm_128_64_32_8_2(argc, argv);
run_gemm_128_64_128_8_2(argc, argv);
run_gemm_128_128_32_8_2(argc, argv);
run_gemm_128_128_64_8_2(argc, argv);
run_gemm_256_64_128_8_2(argc, argv);
run_gemm_256_128_64_8_2(argc, argv);
run_gemm_256_128_128_8_2(argc, argv);
run_gemm_256_128_128_16_2(argc, argv);
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
import enum
import os.path
import shutil
import functools
import operator
import collections
import subprocess
import re
import gemm_op
from gemm_op import *
import user
def SubstituteTemplate(template, values):
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            newtext = re.sub(regex, value, text)
            if newtext != text:
                changed = True
            text = newtext
    return text
class EmitMake:
    def __init__(self):
        self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17

device_memory.o: ../../../../../library/src/utility/device_memory.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp

host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp

main.o: main.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp

# object files for the generated instances; filled in by EmitMake.emit()
obj_files = ${obj_files}

%.o : %.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<

done: libtest.so
	cp libtest.so /lib

main: main.o device_memory.o host_tensor.o $(obj_files)
	hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main

libtest.so: $(obj_files) host_tensor.o device_memory.o
	hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o

all: done main.o
	hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python/ait_impl/generation/ex/shared -l test main.o -o example

clean:
	rm -f *.o libtest.so example
"""
    def emit(self, instances):
        # instances is a list of object-file names, each already ending in ".o "
        values = {
            'obj_files': "".join(instances)
        }
        m_template = self.make_template
        cf = open("Makefile", 'w')
        cf.write(SubstituteTemplate(m_template, values))
        cf.close()
        proc = subprocess.Popen(
            ["make all"],
            shell=True,
            env=os.environ.copy(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = proc.communicate()
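# A minimal usage sketch (illustrative, not part of the original commit):
#
#   m = EmitMake()
#   m.emit(["256_128_128_8_2.o ", "128_64_128_8_2.o "])
#   # writes a Makefile whose obj_files line reads
#   #   obj_files = 256_128_128_8_2.o 128_64_128_8_2.o
#   # and then runs `make all` in the current directory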
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17

device_memory.o: ../../../../../library/src/utility/device_memory.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp

host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp

main.o: main.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp

obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o

%.o : %.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<

all: libtest.so
	cp libtest.so /lib

done: libtest.so main.o
	hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python/ait_impl/generation/ex/shared -l test main.o -o done

main: main.o device_memory.o host_tensor.o $(obj_files)
	hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main

libtest.so: $(obj_files) host_tensor.o device_memory.o
	hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o

clean:
	rm -f *.o libtest.so done main
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17

device_memory.o: ../../../../../library/src/utility/device_memory.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp

host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp

obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o

%.o : %.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<

all: main.o device_memory.o host_tensor.o $(obj_files)
	hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o done

main.o: main.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp

clean:
	rm -f *.o test.so
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
bool run_gemm_128_32_64_8_2(int argc, char* argv[]);
bool run_gemm_128_32_128_8_2(int argc, char* argv[]);
bool run_gemm_128_64_32_8_2(int argc, char* argv[]);
bool run_gemm_128_64_128_8_2(int argc, char* argv[]);
bool run_gemm_128_128_32_8_2(int argc, char* argv[]);
bool run_gemm_128_128_64_8_2(int argc, char* argv[]);
bool run_gemm_256_64_128_8_2(int argc, char* argv[]);
bool run_gemm_256_128_64_8_2(int argc, char* argv[]);
bool run_gemm_256_128_128_8_2(int argc, char* argv[]);
bool run_gemm_256_128_128_16_2(int argc, char* argv[]);
# the structure for creating a list of instances for an op
# was taken from Meta's AIT library
import gemm_op as gemm
import enum
from dataclasses import dataclass
from enum import auto
import ck_types
from ck_types import *
def CreateGemmOperator():
    #operation_kind = library.GemmKind.Gemm
    a_element_desc = TensorDesc(
        DataType.f16, Layout.ColumnMajor
    )
    b_element_desc = TensorDesc(
        DataType.f16, Layout.RowMajor
    )
    c_element_desc = TensorDesc(
        DataType.f16, Layout.RowMajor
    )
    element_op = TensorOperation.PassThrough
    tile_descriptions = [
        gemm.TileDesc(256, 128, 128, 16, 2, 4, 4, 1, "S<8, 2>", "S<8, 2>"),
        gemm.TileDesc(256, 128, 128, 8, 2, 4, 4, 1, "S<8, 2>", "S<8, 2>"),
        gemm.TileDesc(128, 64, 128, 8, 2, 4, 4, 1, "S<4, 2>", "S<8, 2>"),
        gemm.TileDesc(128, 128, 64, 8, 2, 4, 4, 1, "S<8, 2>", "S<4, 2>"),
        gemm.TileDesc(256, 64, 128, 8, 2, 2, 4, 1, "S<8, 2>", "S<8, 2>"),
        gemm.TileDesc(256, 128, 64, 8, 2, 4, 2, 1, "S<8, 2>", "S<8, 2>"),
        gemm.TileDesc(128, 32, 128, 8, 2, 2, 4, 1, "S<4, 2>", "S<8, 2>"),
        gemm.TileDesc(128, 128, 32, 8, 2, 4, 2, 1, "S<8, 2>", "S<4, 2>"),
        gemm.TileDesc(128, 32, 64, 8, 2, 2, 2, 1, "S<4, 2>", "S<8, 2>"),
        gemm.TileDesc(128, 64, 32, 8, 2, 2, 2, 1, "S<8, 2>", "S<4, 2>"),
    ]
    a_block_descriptions = [
        gemm.BlockTransferDesc("S<2, 1, 4, 2>", "S<8, 1, 32, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 4, 2>", "S<8, 1, 32, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 4, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 8, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 2, 2>", "S<8, 1, 32, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 2, 1>", "S<0, 3, 1, 2>", "S<1, 1, 2, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 4, 2>", "S<8, 1, 32, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 2, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 2, 1>", "S<0, 3, 1, 2>", "S<1, 1, 2, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 8, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 2, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 2, 1>", "S<0, 3, 1, 2>", "S<1, 1, 2, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 4, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
    ]
    b_block_descriptions = [
        gemm.BlockTransferDesc("S<2, 1, 4, 2>", "S<8, 1, 32, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 4, 2>", "S<8, 1, 32, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 8, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 4, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 4, 2>", "S<8, 1, 32, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 2, 2>", "S<8, 1, 32, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 2, 1>", "S<0, 3, 1, 2>", "S<1, 1, 2, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 8, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 2, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 2, 1>", "S<0, 3, 1, 2>", "S<1, 1, 2, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 4, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 4, 1>", "S<0, 3, 1, 2>", "S<1, 1, 4, 2>"),
        gemm.BlockTransferDesc("S<1, 1, 2, 2>", "S<8, 1, 16, 1>", "S<0, 3, 1, 2>", "S<0, 3, 1, 2>", "S<1, 1, 2, 1>", "S<0, 3, 1, 2>", "S<1, 1, 2, 2>"),
    ]
    c_block_descriptions = [
        gemm.CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 4),
        gemm.CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 4),
        gemm.CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 4),
        gemm.CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 4),
        gemm.CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 4),
        gemm.CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 2),
        gemm.CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 4),
        gemm.CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 2),
        gemm.CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 4),
        gemm.CBlockTransferDesc("S<0, 1, 2, 3, 4, 5>", 5, 2),
    ]
    #a_block_descriptions = b_block_descriptions
    #c_block_descriptions = []
    # AIT logic, adapt later:
    # for t in tile_descriptions:
    #     a_block_transfer = -1
    #     c_block_transfer = -1
    #     if t.block_size == 256:
    #         a_block_transfer = [4, 64, 1]
    #         c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 32, 1, 8], 8)
    #     if t.block_size == 128:
    #         a_block_transfer = [4, 32, 1]
    #         if t.n_per_block == 128:
    #             c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 16, 1, 8], 8)
    #         if t.n_per_block == 64:
    #             c_block_transfer = gemm.CBlockTransferDesc(1, 1, [1, 32, 1, 4], 8)
    #     assert (
    #         a_block_transfer != -1
    #         and c_block_transfer != -1
    #         and "Cannot determine block_transfer_size with block_size "
    #         + str(t.block_size)
    #     )
    #     a_block_descriptions.append(
    #         gemm.BlockTransferDesc(a_block_transfer, [1, 0, 2], [1, 0, 2], 2, 8, 8, 1)
    #     )
    #     c_block_descriptions.append(c_block_transfer)
    gemm_specialization = [
        gemm.GemmType.GemmDefault
    ]
    operations = []
    for gemm_spec in gemm_specialization:
        for tile_desc, a_block_desc, b_block_desc, c_block_desc in zip(
            tile_descriptions,
            a_block_descriptions,
            b_block_descriptions,
            c_block_descriptions,
        ):
            new_operation = gemm.GemmOperation(
                #operation_kind=operation_kind,
                A=a_element_desc,
                B=b_element_desc,
                C=c_element_desc,
                a_elem_op=element_op,
                b_elem_op=element_op,
                epilogue_functor=element_op,
                gemm_specialization=gemm_spec,
                tile_desc=tile_desc,
                a_block_transfer=a_block_desc,
                b_block_transfer=b_block_desc,
                c_block_transfer=c_block_desc,
            )
            #manifest.append(new_operation)
            operations.append(new_operation)
    return operations


if __name__ == "__main__":
    print(CreateGemmOperator()[0].tile_desc)
git config --global --add safe.directory /root/workspace/xformers-rocm
git config --global --add safe.directory /root/workspace/xformers-rocm/third_party/composable_kernel
git config --global --add safe.directory /root/workspace/xformers-rocm/third_party/cutlass
git config --global --add safe.directory /root/workspace/xformers-rocm/third_party/flash-attention
git submodule update --init --recursive
pip install -r requirements.txt
pip install -U matplotlib
pip install pandas
pip install seaborn
pip install triton
pip install -e ./