Commit 47cc9b7e authored by Astha Rai

Added compilation of a shared library and multiple instances for GEMM; cleaned up the code design.

parent adbefd90
@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
bool run_gemm_256_128_128_16_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true;
}
bool run_gemm_example(int argc, char* argv[])
bool __attribute__((visibility("default"))) run_gemm_256_128_128_16_2(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_128_128_16_2(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
@@ -35,21 +35,21 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
256,
128,
128,
16,
8,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<1, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<1, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
bool run_gemm_256_128_128_8_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true;
}
bool run_gemm_example(int argc, char* argv[])
bool __attribute__((visibility("default"))) run_gemm_256_128_128_8_2(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_128_128_8_2(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
@@ -32,39 +32,39 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::Default,
256,
128,
128,
128,
32,
64,
8,
2,
4,
2,
32,
32,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<1, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<1, 1, 2, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<1, 1, 2, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<1, 2, 3, 5, 5, 6>,
6,
5>;
S<1, 1, 2, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
2>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
bool run_gemm_256_128_64_8_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true;
}
bool run_gemm_example(int argc, char* argv[])
bool __attribute__((visibility("default"))) run_gemm_256_128_64_8_2(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_128_64_8_2(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
@@ -33,23 +33,23 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::Default,
256,
64,
128,
128,
16,
8,
2,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<1, 1, 2, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<1, 1, 2, 1>,
S<0, 3, 1, 2>,
S<1, 1, 2, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
bool run_gemm_256_64_128_8_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true;
}
bool run_gemm_example(int argc, char* argv[])
bool __attribute__((visibility("default"))) run_gemm_256_64_128_8_2(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_64_128_8_2(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
device_memory.o: ../../../../../library/src/utility/device_memory.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
hipcc -shared -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp
obj_files = 256.o
%.o : %.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
all: test.so
test.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
clean:
rm -f *.o test.so
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
device_memory.o: ../../library/src/utility/device_memory.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp
host_tensor.o: ../../library/src/utility/host_tensor.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
main.o: main.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp
obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o
%.o : %.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
done: libtest.so
cp libtest.so /lib
main: main.o device_memory.o host_tensor.o $(obj_files)
hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main
libtest.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
all: done main.o
hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python -l test main.o -o example
clean:
rm -f *.o libtest.so example Makefile
\ No newline at end of file
from dataclasses import dataclass
class DataType:
f16 = "ck::half_t"
class Layout:
ColumnMajor = "ck::tensor_layout::gemm::ColumnMajor"
RowMajor = "ck::tensor_layout::gemm::RowMajor"
class TensorOperation:
PassThrough = "ck::tensor_operation::element_wise::PassThrough"
@dataclass
class TensorDesc: #set up and import properly
element: DataType
layout: Layout
@@ -16,9 +16,15 @@ from gemm_op import *
import user
from ck_types import *
from gemm_ex import *
from make_template import *
# holds multiple gemm instances
op_collection = user.CreateGemmOperator()
instances = []
for op in op_collection:
instances.append((str(op.tile_desc.block_size) + "_" + str(op.tile_desc.m_per_block) + "_" + str(op.tile_desc.n_per_block) + "_" + str(op.tile_desc.k_per_block) + "_" + str(op.tile_desc.k1) + ".o "))
x = EmitGemmInstance()
x.emit(op)
\ No newline at end of file
x.emit(op)
m = EmitMake()
m.emit(instances)
#print(str(instances))
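For illustration, the loop above maps each tile descriptor to an instance name of the form blocksize_m_n_k_k1. A minimal sketch with example values (the helper below is hypothetical and not part of this commit):

# Illustrative sketch only: how a tile descriptor maps to a generated instance name.
def instance_name(block_size, m_per_block, n_per_block, k_per_block, k1):
    return "_".join(str(v) for v in (block_size, m_per_block, n_per_block, k_per_block, k1))

print(instance_name(256, 128, 128, 8, 2))  # -> 256_128_128_8_2
# EmitGemmInstance writes 256_128_128_8_2.cpp exporting run_gemm_256_128_128_8_2(),
# and 256_128_128_8_2.o is listed in obj_files of the generated Makefile.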
@@ -92,7 +92,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
bool run_gemm_${name}(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
@@ -200,18 +200,18 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true;
}
bool run_gemm_example(int argc, char* argv[])
bool __attribute__((visibility("default"))) run_gemm_${name}(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_${name}(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""
def emit(self,operation):
values = {
'name' : (str(operation.tile_desc.block_size) + "_" + str(operation.tile_desc.m_per_block) + "_" + str(operation.tile_desc.n_per_block)+ "_" + str(operation.tile_desc.k_per_block) + "_" + str(operation.tile_desc.k1)),
'type_a' : operation.A.element,
'type_b' : operation.B.element,
'type_c' : operation.C.element,
@@ -252,31 +252,10 @@ int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
'CTT_dst_scalar_per_vector' : str(operation.c_block_transfer.dst_scalar_per_vector),
}
template = self.gemm_devop_template
name = str(operation.tile_desc.block_size)
name = (str(operation.tile_desc.block_size) + "_" + str(operation.tile_desc.m_per_block) + "_" + str(operation.tile_desc.n_per_block)
+ "_" + str(operation.tile_desc.k_per_block) + "_" + str(operation.tile_desc.k1))
cf = open("%s.cpp" % name,'w')
print(SubstituteTemplate(template, values))
cf.write(SubstituteTemplate(template, values))
cf.close()
# A = TensorDesc(DataType.f16, Layout.RowMajor)
# B = TensorDesc(DataType.f16, Layout.ColumnMajor)
# C = TensorDesc(DataType.f16, Layout.RowMajor)
# gemm = gemm_op.GemmOperation(
# A=A,
# B=B,
# C=C,
# a_elem_op=TensorOperation.PassThrough,
# b_elem_op=TensorOperation.PassThrough,
# epilogue_functor=TensorOperation.PassThrough,
# gemm_specialization=GemmType.GemmDefault,
# tile_desc=TileDesc(256, 256, 128, 32, 8, 2, 32, 32, 1, [8,2], [8,2]),
# a_block_transfer=BlockTransferDesc(
# [2, 1, 4, 2], [8, 1, 32, 1], [0, 3, 1, 2], [0, 3, 1, 2],[1, 1, 4, 1], [0, 3, 1, 2], [1, 1, 4, 2]
# ),
# b_block_transfer=BlockTransferDesc(
# [2, 1, 4, 2], [8, 1, 32, 1], [0, 3, 1, 2], [0, 3, 1, 2], [1, 1, 4, 1], [0, 3, 1, 2], [1, 1, 4, 2]
# ),
# c_block_transfer=CBlockTransferDesc([0, 1, 2, 3, 4, 5], 5, 4),
# )
# a = EmitGemmInstance()
# a.emit(gemm)
#include "run.h"
int main(int argc, char* argv[])
{
//return !run_gemm_example(argc, argv);
run_gemm_128_32_128_8_2(argc, argv);
run_gemm_128_64_32_8_2(argc, argv);
run_gemm_128_64_128_8_2(argc, argv);
run_gemm_128_128_32_8_2(argc, argv);
run_gemm_128_128_64_8_2(argc, argv);
run_gemm_256_64_128_8_2(argc, argv);
run_gemm_256_128_64_8_2(argc, argv);
run_gemm_256_128_128_8_2(argc, argv);
run_gemm_256_128_128_16_2(argc, argv);
}
@@ -26,38 +26,51 @@ def SubstituteTemplate(template, values):
class EmitMake:
def __init__(self):
self.make_template = """
self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
device_memory.o: ../../../../../library/src/utility/device_memory.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp
device_memory.o: ../../library/src/utility/device_memory.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp
host_tensor.o: ../../library/src/utility/host_tensor.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
hipcc -shared -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp
main.o: main.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp
obj_files = 256.o
obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o
%.o : %.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
done: libtest.so
cp libtest.so /lib
all: test.so
main: main.o device_memory.o host_tensor.o $(obj_files)
hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main
test.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
libtest.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
all: done main.o
hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python -l test main.o -o example
clean:
rm -f *.o test.so
rm -f *.o libtest.so example Makefile
"""
def emit(self,operation):
def emit(self, instances):
obj_files = instances
values = {
'temp' : ""
'obj_files' : str(instances)
}
m_template = self.make_template
cf = open("Makefile", 'w')
print(SubstituteTemplate(m_template, values))
cf.write(SubstituteTemplate(m_template, values))
cf.close()
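The emitters above rely on SubstituteTemplate to fill ${...} placeholders such as ${name} and ${obj_files}. Its body is not shown in this diff, so the following is only a plausible minimal sketch of such a helper:

def SubstituteTemplate(template, values):
    # Assumed behaviour: replace every ${key} occurrence with values[key];
    # placeholders with no entry in values are left untouched.
    text = template
    for key, value in values.items():
        text = text.replace("${%s}" % key, str(value))
    return text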
@@ -65,11 +78,11 @@ clean:
STDOUT = -2
proc = subprocess.Popen(
["make"],
["make all"],
shell=True,
env=os.environ.copy(),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
out, err = proc.communicate()
\ No newline at end of file
out, err = proc.communicate()
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
main.o: main.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp
obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o
%.o : %.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
done: libtest.so
cp libtest.so /lib
main: main.o device_memory.o host_tensor.o $(obj_files)
hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main
libtest.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
all: done main.o
hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python/ait_impl/generation/ex/shared -l test main.o -o example
clean:
rm -f *.o libtest.so example
\ No newline at end of file
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
bool run_gemm_128_32_64_8_2(int argc, char* argv[]);
bool run_gemm_128_32_128_8_2(int argc, char* argv[]);
bool run_gemm_128_64_32_8_2(int argc, char* argv[]);
bool run_gemm_128_64_128_8_2(int argc, char* argv[]);
bool run_gemm_128_128_32_8_2(int argc, char* argv[]);
bool run_gemm_128_128_64_8_2(int argc, char* argv[]);
bool run_gemm_256_64_128_8_2(int argc, char* argv[]);
bool run_gemm_256_128_64_8_2(int argc, char* argv[]);
bool run_gemm_256_128_128_8_2(int argc, char* argv[]);
bool run_gemm_256_128_128_16_2(int argc, char* argv[]);
\ No newline at end of file
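The generated libtest.so (copied to /lib by the done target above) exports these run_gemm_* entry points with default visibility. As a hedged illustration of driving them from Python, the ctypes sketch below assumes the functions are additionally declared extern "C"; in this commit they keep C++ linkage, so in practice the mangled symbol names, or the linked main/example binaries, would have to be used instead:

import ctypes

# Hypothetical consumer of libtest.so; assumes extern "C" linkage for the
# exported run_gemm_* functions (not the case in this commit, where the names
# are C++-mangled). Argument order mirrors the example's parse_cmd_args
# (program name, do_verification, init_method, time_kernel) and is an assumption.
lib = ctypes.CDLL("libtest.so")

run = lib.run_gemm_256_128_128_8_2
run.restype = ctypes.c_bool
run.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p)]

argv_strings = [b"gemm", b"1", b"1", b"0"]
argv = (ctypes.c_char_p * len(argv_strings))(*argv_strings)
print("passed" if run(len(argv_strings), argv) else "failed")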