"lightx2v_kernel/vscode:/vscode.git/clone" did not exist on "3e4fe79b5b9d7f2d49eac0f94f6e1f25dbf6f5be"
Commit 47cc9b7e authored by Astha Rai's avatar Astha Rai
Browse files

added compilation of shared library and multiple instances for gemm, cleaned up code design

parent adbefd90
...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) bool run_gemm_256_128_128_16_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
using namespace ck::literals; using namespace ck::literals;
...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true; return true;
} }
bool run_gemm_example(int argc, char* argv[]) bool __attribute__((visibility("default"))) run_gemm_256_128_128_16_2(int argc, char* argv[])
{ {
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_128_128_16_2(problem_size, config);
} }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
...@@ -35,21 +35,21 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -35,21 +35,21 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
256, 256,
128, 128,
128, 128,
16, 8,
2, 2,
4, 4,
4, 4,
1, 1,
S<8, 2>, S<8, 2>,
S<8, 2>, S<8, 2>,
S<2, 1, 4, 2>, S<1, 1, 4, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 1>, S<1, 1, 4, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 2>, S<1, 1, 4, 2>,
S<2, 1, 4, 2>, S<1, 1, 4, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) bool run_gemm_256_128_128_8_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
using namespace ck::literals; using namespace ck::literals;
...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true; return true;
} }
bool run_gemm_example(int argc, char* argv[]) bool __attribute__((visibility("default"))) run_gemm_256_128_128_8_2(int argc, char* argv[])
{ {
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_128_128_8_2(problem_size, config);
} }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
...@@ -32,39 +32,39 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -32,39 +32,39 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::Default, ck::tensor_operation::device::GemmSpecialization::Default,
256,
128, 128,
128, 64,
128, 8,
32, 2,
4,
2, 2,
32,
32,
1, 1,
S<8, 2>, S<8, 2>,
S<8, 2>, S<8, 2>,
S<2, 1, 4, 2>, S<1, 1, 4, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 1>, S<1, 1, 4, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 2>, S<1, 1, 4, 2>,
S<2, 1, 4, 2>, S<1, 1, 2, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 1>, S<1, 1, 2, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 2>, S<1, 1, 2, 2>,
S<1, 2, 3, 5, 5, 6>, S<0, 1, 2, 3, 4, 5>,
6, 5,
5>; 2>;
using ReferenceGemmInstance = ck::tensor_operation::host:: using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) bool run_gemm_256_128_64_8_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
using namespace ck::literals; using namespace ck::literals;
...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true; return true;
} }
bool run_gemm_example(int argc, char* argv[]) bool __attribute__((visibility("default"))) run_gemm_256_128_64_8_2(int argc, char* argv[])
{ {
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_128_64_8_2(problem_size, config);
} }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
...@@ -33,23 +33,23 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -33,23 +33,23 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::Default, ck::tensor_operation::device::GemmSpecialization::Default,
256, 256,
64,
128, 128,
128, 8,
16, 2,
2, 2,
4,
4, 4,
1, 1,
S<8, 2>, S<8, 2>,
S<8, 2>, S<8, 2>,
S<2, 1, 4, 2>, S<1, 1, 2, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 1>, S<1, 1, 2, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 2, 2>,
S<1, 1, 4, 2>, S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) bool run_gemm_256_64_128_8_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
using namespace ck::literals; using namespace ck::literals;
...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true; return true;
} }
bool run_gemm_example(int argc, char* argv[]) bool __attribute__((visibility("default"))) run_gemm_256_64_128_8_2(int argc, char* argv[])
{ {
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_64_128_8_2(problem_size, config);
} }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17 CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
device_memory.o: ../../../../../library/src/utility/device_memory.cpp CXXFLAGS = -std=c++17
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp
device_memory.o: ../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp
hipcc -shared -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp
host_tensor.o: ../../library/src/utility/host_tensor.cpp
obj_files = 256.o hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
%.o : %.cpp main.o: main.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $< hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp
all: test.so obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o
test.so: $(obj_files) host_tensor.o device_memory.o %.o : %.cpp
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
clean: done: libtest.so
rm -f *.o test.so cp libtest.so /lib
main: main.o device_memory.o host_tensor.o $(obj_files)
hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main
libtest.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
all: done main.o
hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python -l test main.o -o example
clean:
rm -f *.o libtest.so example Makefile
\ No newline at end of file
from dataclasses import dataclass
class DataType:
    """Namespace of C++ element-type names, stored as plain strings."""

    # Half-precision element type as spelled in the ck namespace.
    f16 = "ck::half_t"
class Layout:
    """Namespace of C++ GEMM tensor-layout type names, stored as plain strings."""

    ColumnMajor = "ck::tensor_layout::gemm::ColumnMajor"
    RowMajor = "ck::tensor_layout::gemm::RowMajor"
class TensorOperation:
    """Namespace of C++ element-wise operation type names, stored as plain strings."""

    PassThrough = "ck::tensor_operation::element_wise::PassThrough"
@dataclass
class TensorDesc:  # set up and import properly
    """Describes one GEMM operand by its element type and memory layout.

    Both fields hold C++ type names as strings (for example the constants
    exposed by ``DataType`` and ``Layout``), so they are annotated as ``str``
    rather than as the namespace classes themselves.
    """

    # C++ element type name, e.g. "ck::half_t".
    element: str
    # C++ layout type name, e.g. "ck::tensor_layout::gemm::RowMajor".
    layout: str
...@@ -16,9 +16,15 @@ from gemm_op import * ...@@ -16,9 +16,15 @@ from gemm_op import *
import user import user
from ck_types import * from ck_types import *
from gemm_ex import * from gemm_ex import *
from make_template import *
# holds multiple gemm instances # holds multiple gemm instances
op_collection = user.CreateGemmOperator() op_collection = user.CreateGemmOperator()
instances = []
for op in op_collection: for op in op_collection:
instances.append((str(op.tile_desc.block_size) + "_" + str(op.tile_desc.m_per_block) + "_" + str(op.tile_desc.n_per_block) + "_" + str(op.tile_desc.k_per_block) + "_" + str(op.tile_desc.k1) + ".o "))
x = EmitGemmInstance() x = EmitGemmInstance()
x.emit(op) x.emit(op)
\ No newline at end of file m = EmitMake()
m.emit(instances)
#print(str(instances))
...@@ -92,7 +92,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -92,7 +92,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) bool run_gemm_${name}(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
using namespace ck::literals; using namespace ck::literals;
...@@ -200,18 +200,18 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -200,18 +200,18 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true; return true;
} }
bool run_gemm_example(int argc, char* argv[]) bool __attribute__((visibility("default"))) run_gemm_${name}(int argc, char* argv[])
{ {
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_${name}(problem_size, config);
} }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
""" """
def emit(self,operation): def emit(self,operation):
values = { values = {
'name' : (str(operation.tile_desc.block_size) + "_" + str(operation.tile_desc.m_per_block) + "_" + str(operation.tile_desc.n_per_block)+ "_" + str(operation.tile_desc.k_per_block) + "_" + str(operation.tile_desc.k1)),
'type_a' : operation.A.element, 'type_a' : operation.A.element,
'type_b' : operation.B.element, 'type_b' : operation.B.element,
'type_c' : operation.C.element, 'type_c' : operation.C.element,
...@@ -252,31 +252,10 @@ int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } ...@@ -252,31 +252,10 @@ int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
'CTT_dst_scalar_per_vector' : str(operation.c_block_transfer.dst_scalar_per_vector), 'CTT_dst_scalar_per_vector' : str(operation.c_block_transfer.dst_scalar_per_vector),
} }
template = self.gemm_devop_template template = self.gemm_devop_template
name = str(operation.tile_desc.block_size) name = (str(operation.tile_desc.block_size) + "_" + str(operation.tile_desc.m_per_block) + "_" + str(operation.tile_desc.n_per_block)
+ "_" + str(operation.tile_desc.k_per_block) + "_" + str(operation.tile_desc.k1))
cf = open("%s.cpp" % name,'w') cf = open("%s.cpp" % name,'w')
print(SubstituteTemplate(template, values))
cf.write(SubstituteTemplate(template, values)) cf.write(SubstituteTemplate(template, values))
cf.close() cf.close()
# A = TensorDesc(DataType.f16, Layout.RowMajor)
# B = TensorDesc(DataType.f16, Layout.ColumnMajor)
# C = TensorDesc(DataType.f16, Layout.RowMajor)
# gemm = gemm_op.GemmOperation(
# A=A,
# B=B,
# C=C,
# a_elem_op=TensorOperation.PassThrough,
# b_elem_op=TensorOperation.PassThrough,
# epilogue_functor=TensorOperation.PassThrough,
# gemm_specialization=GemmType.GemmDefault,
# tile_desc=TileDesc(256, 256, 128, 32, 8, 2, 32, 32, 1, [8,2], [8,2]),
# a_block_transfer=BlockTransferDesc(
# [2, 1, 4, 2], [8, 1, 32, 1], [0, 3, 1, 2], [0, 3, 1, 2],[1, 1, 4, 1], [0, 3, 1, 2], [1, 1, 4, 2]
# ),
# b_block_transfer=BlockTransferDesc(
# [2, 1, 4, 2], [8, 1, 32, 1], [0, 3, 1, 2], [0, 3, 1, 2], [1, 1, 4, 1], [0, 3, 1, 2], [1, 1, 4, 2]
# ),
# c_block_transfer=CBlockTransferDesc([0, 1, 2, 3, 4, 5], 5, 4),
# )
# a = EmitGemmInstance()
# a.emit(gemm)
#include "run.h"
/**
 * @brief Runs every generated GEMM instance with the same command-line arguments.
 *
 * Each run_gemm_* entry point returns a bool. The results used to be discarded,
 * so the process exited with 0 even when an instance reported failure. They are
 * now folded into the exit status, matching the earlier single-instance
 * convention of `return !run_gemm_example(argc, argv);`.
 *
 * @param argc Argument count, forwarded unchanged to every instance.
 * @param argv Argument vector, forwarded unchanged to every instance.
 * @return 0 when every instance returned true, 1 otherwise.
 */
int main(int argc, char* argv[])
{
    using RunFn = bool (*)(int, char*[]);

    // Same instances, in the same order, as the original call sequence.
    const RunFn instances[] = {
        run_gemm_128_32_128_8_2,
        run_gemm_128_64_32_8_2,
        run_gemm_128_64_128_8_2,
        run_gemm_128_128_32_8_2,
        run_gemm_128_128_64_8_2,
        run_gemm_256_64_128_8_2,
        run_gemm_256_128_64_8_2,
        run_gemm_256_128_128_8_2,
        run_gemm_256_128_128_16_2,
    };

    bool all_passed = true;
    for(const RunFn run : instances)
    {
        // Call first, then combine: a failure must not stop the remaining
        // instances from running.
        all_passed = run(argc, argv) && all_passed;
    }

    return all_passed ? 0 : 1;
}
...@@ -26,38 +26,51 @@ def SubstituteTemplate(template, values): ...@@ -26,38 +26,51 @@ def SubstituteTemplate(template, values):
class EmitMake: class EmitMake:
def __init__(self): def __init__(self):
self.make_template = """ self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/ CFLAGS=-I 
~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17 CXXFLAGS = -std=c++17
device_memory.o: ../../../../../library/src/utility/device_memory.cpp device_memory.o: ../../library/src/utility/device_memory.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp
host_tensor.o: ../../library/src/utility/host_tensor.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp main.o: main.cpp
hipcc -shared -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp
obj_files = 256.o obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o
%.o : %.cpp %.o : %.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $< hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
done: libtest.so
cp libtest.so /lib
all: test.so main: main.o device_memory.o host_tensor.o $(obj_files)
hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main
test.so: $(obj_files) host_tensor.o device_memory.o libtest.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
all: done main.o
hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python -l test main.o -o example
clean: clean:
rm -f *.o test.so rm -f *.o libtest.so example Makefile
""" """
def emit(self,operation): def emit(self, instances):
obj_files = instances
values = { values = {
'temp' : "" 'obj_files' : str(instances)
} }
m_template = self.make_template m_template = self.make_template
cf = open("Makefile", 'w') cf = open("Makefile", 'w')
print(SubstituteTemplate(m_template, values))
cf.write(SubstituteTemplate(m_template, values)) cf.write(SubstituteTemplate(m_template, values))
cf.close() cf.close()
...@@ -65,11 +78,11 @@ clean: ...@@ -65,11 +78,11 @@ clean:
STDOUT = -2 STDOUT = -2
proc = subprocess.Popen( proc = subprocess.Popen(
["make"], ["make all"],
shell=True, shell=True,
env=os.environ.copy(), env=os.environ.copy(),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
) )
out, err = proc.communicate() out, err = proc.communicate()
\ No newline at end of file
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/ CFLAGS=-I 
~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17 CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../../library/src/utility/device_memory.cpp device_memory.o: ../../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp
ex.o: main.o: main.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp
obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o
%.o : %.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
done: libtest.so
cp libtest.so /lib
main: main.o device_memory.o host_tensor.o $(obj_files)
hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main
libtest.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
all: done main.o
hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python/ait_impl/generation/ex/shared -l test main.o -o example
clean:
rm -f *.o libtest.so example
\ No newline at end of file
// Guard against multiple inclusion of this header.
#pragma once

#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"

// Entry points of the generated GEMM instances.
//
// The numeric suffix is built by the generator as
//   <block_size>_<m_per_block>_<n_per_block>_<k_per_block>_<k1>
// Each function takes the program's command-line arguments and returns a bool.
//
// NOTE(review): run_gemm_128_32_64_8_2 is declared here but is not among the
// instances called from main -- confirm that it is actually generated and linked.
bool run_gemm_128_32_64_8_2(int argc, char* argv[]);
bool run_gemm_128_32_128_8_2(int argc, char* argv[]);
bool run_gemm_128_64_32_8_2(int argc, char* argv[]);
bool run_gemm_128_64_128_8_2(int argc, char* argv[]);
bool run_gemm_128_128_32_8_2(int argc, char* argv[]);
bool run_gemm_128_128_64_8_2(int argc, char* argv[]);
bool run_gemm_256_64_128_8_2(int argc, char* argv[]);
bool run_gemm_256_128_64_8_2(int argc, char* argv[]);
bool run_gemm_256_128_128_8_2(int argc, char* argv[]);
bool run_gemm_256_128_128_16_2(int argc, char* argv[]);
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment