Commit 47cc9b7e authored by Astha Rai's avatar Astha Rai
Browse files

Added compilation of a shared library and multiple GEMM instances; cleaned up the code design.

parent adbefd90
...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) bool run_gemm_256_128_128_16_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
using namespace ck::literals; using namespace ck::literals;
...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true; return true;
} }
bool run_gemm_example(int argc, char* argv[]) bool __attribute__((visibility("default"))) run_gemm_256_128_128_16_2(int argc, char* argv[])
{ {
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_128_128_16_2(problem_size, config);
} }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
...@@ -35,21 +35,21 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -35,21 +35,21 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
256, 256,
128, 128,
128, 128,
16, 8,
2, 2,
4, 4,
4, 4,
1, 1,
S<8, 2>, S<8, 2>,
S<8, 2>, S<8, 2>,
S<2, 1, 4, 2>, S<1, 1, 4, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 1>, S<1, 1, 4, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 2>, S<1, 1, 4, 2>,
S<2, 1, 4, 2>, S<1, 1, 4, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) bool run_gemm_256_128_128_8_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
using namespace ck::literals; using namespace ck::literals;
...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true; return true;
} }
bool run_gemm_example(int argc, char* argv[]) bool __attribute__((visibility("default"))) run_gemm_256_128_128_8_2(int argc, char* argv[])
{ {
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_128_128_8_2(problem_size, config);
} }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
...@@ -32,39 +32,39 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -32,39 +32,39 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::Default, ck::tensor_operation::device::GemmSpecialization::Default,
256,
128, 128,
128, 64,
128, 8,
32, 2,
4,
2, 2,
32,
32,
1, 1,
S<8, 2>, S<8, 2>,
S<8, 2>, S<8, 2>,
S<2, 1, 4, 2>, S<1, 1, 4, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 1>, S<1, 1, 4, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 2>, S<1, 1, 4, 2>,
S<2, 1, 4, 2>, S<1, 1, 2, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 1>, S<1, 1, 2, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 2>, S<1, 1, 2, 2>,
S<1, 2, 3, 5, 5, 6>, S<0, 1, 2, 3, 4, 5>,
6, 5,
5>; 2>;
using ReferenceGemmInstance = ck::tensor_operation::host:: using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) bool run_gemm_256_128_64_8_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
using namespace ck::literals; using namespace ck::literals;
...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true; return true;
} }
bool run_gemm_example(int argc, char* argv[]) bool __attribute__((visibility("default"))) run_gemm_256_128_64_8_2(int argc, char* argv[])
{ {
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_128_64_8_2(problem_size, config);
} }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
...@@ -33,23 +33,23 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -33,23 +33,23 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::Default, ck::tensor_operation::device::GemmSpecialization::Default,
256, 256,
64,
128, 128,
128, 8,
16, 2,
2, 2,
4,
4, 4,
1, 1,
S<8, 2>, S<8, 2>,
S<8, 2>, S<8, 2>,
S<2, 1, 4, 2>, S<1, 1, 2, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 4, 1>, S<1, 1, 2, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<1, 1, 2, 2>,
S<1, 1, 4, 2>, S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>, S<8, 1, 32, 1>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
S<0, 3, 1, 2>, S<0, 3, 1, 2>,
...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -64,7 +64,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) bool run_gemm_256_64_128_8_2(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
using namespace ck::literals; using namespace ck::literals;
...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -172,12 +172,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true; return true;
} }
bool run_gemm_example(int argc, char* argv[]) bool __attribute__((visibility("default"))) run_gemm_256_64_128_8_2(int argc, char* argv[])
{ {
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_256_64_128_8_2(problem_size, config);
} }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17 CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
device_memory.o: ../../../../../library/src/utility/device_memory.cpp CXXFLAGS = -std=c++17
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp
device_memory.o: ../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp
hipcc -shared -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp
host_tensor.o: ../../library/src/utility/host_tensor.cpp
obj_files = 256.o hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
%.o : %.cpp main.o: main.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $< hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp
all: test.so obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o
test.so: $(obj_files) host_tensor.o device_memory.o %.o : %.cpp
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
clean: done: libtest.so
rm -f *.o test.so cp libtest.so /lib
main: main.o device_memory.o host_tensor.o $(obj_files)
hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main
libtest.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
all: done main.o
hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python -l test main.o -o example
clean:
rm -f *.o libtest.so example Makefile
\ No newline at end of file
from dataclasses import dataclass
class DataType:
    """String constants naming CK element data types for code emission."""

    # Half-precision float as spelled in generated CK C++ source.
    f16 = "ck::half_t"
class Layout:
    """String constants naming CK GEMM tensor-layout tags for code emission."""

    # Layout tag strings as spelled in generated CK C++ source.
    ColumnMajor = "ck::tensor_layout::gemm::ColumnMajor"
    RowMajor = "ck::tensor_layout::gemm::RowMajor"
class TensorOperation:
    """String constants naming CK element-wise operations for code emission."""

    # Identity element-wise op as spelled in generated CK C++ source.
    PassThrough = "ck::tensor_operation::element_wise::PassThrough"
@dataclass
class TensorDesc:  # TODO: set up and import properly
    # Describes one GEMM tensor operand for the code emitter:
    # which element type it holds and how it is laid out in memory.
    element: DataType  # one of the DataType string constants (CK type name)
    layout: Layout     # one of the Layout string constants (CK layout tag)
...@@ -16,9 +16,15 @@ from gemm_op import * ...@@ -16,9 +16,15 @@ from gemm_op import *
import user import user
from ck_types import * from ck_types import *
from gemm_ex import * from gemm_ex import *
from make_template import *
# holds multiple gemm instances # holds multiple gemm instances
op_collection = user.CreateGemmOperator() op_collection = user.CreateGemmOperator()
instances = []
for op in op_collection: for op in op_collection:
instances.append((str(op.tile_desc.block_size) + "_" + str(op.tile_desc.m_per_block) + "_" + str(op.tile_desc.n_per_block) + "_" + str(op.tile_desc.k_per_block) + "_" + str(op.tile_desc.k1) + ".o "))
x = EmitGemmInstance() x = EmitGemmInstance()
x.emit(op) x.emit(op)
\ No newline at end of file m = EmitMake()
m.emit(instances)
#print(str(instances))
...@@ -92,7 +92,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl< ...@@ -92,7 +92,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) bool run_gemm_${name}(const ProblemSize& problem_size, const ExecutionConfig& config)
{ {
using namespace ck::literals; using namespace ck::literals;
...@@ -200,18 +200,18 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ...@@ -200,18 +200,18 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
return true; return true;
} }
bool run_gemm_example(int argc, char* argv[]) bool __attribute__((visibility("default"))) run_gemm_${name}(int argc, char* argv[])
{ {
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm_${name}(problem_size, config);
} }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
""" """
def emit(self,operation): def emit(self,operation):
values = { values = {
'name' : (str(operation.tile_desc.block_size) + "_" + str(operation.tile_desc.m_per_block) + "_" + str(operation.tile_desc.n_per_block)+ "_" + str(operation.tile_desc.k_per_block) + "_" + str(operation.tile_desc.k1)),
'type_a' : operation.A.element, 'type_a' : operation.A.element,
'type_b' : operation.B.element, 'type_b' : operation.B.element,
'type_c' : operation.C.element, 'type_c' : operation.C.element,
...@@ -252,31 +252,10 @@ int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } ...@@ -252,31 +252,10 @@ int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
'CTT_dst_scalar_per_vector' : str(operation.c_block_transfer.dst_scalar_per_vector), 'CTT_dst_scalar_per_vector' : str(operation.c_block_transfer.dst_scalar_per_vector),
} }
template = self.gemm_devop_template template = self.gemm_devop_template
name = str(operation.tile_desc.block_size) name = (str(operation.tile_desc.block_size) + "_" + str(operation.tile_desc.m_per_block) + "_" + str(operation.tile_desc.n_per_block)
+ "_" + str(operation.tile_desc.k_per_block) + "_" + str(operation.tile_desc.k1))
cf = open("%s.cpp" % name,'w') cf = open("%s.cpp" % name,'w')
print(SubstituteTemplate(template, values))
cf.write(SubstituteTemplate(template, values)) cf.write(SubstituteTemplate(template, values))
cf.close() cf.close()
# A = TensorDesc(DataType.f16, Layout.RowMajor)
# B = TensorDesc(DataType.f16, Layout.ColumnMajor)
# C = TensorDesc(DataType.f16, Layout.RowMajor)
# gemm = gemm_op.GemmOperation(
# A=A,
# B=B,
# C=C,
# a_elem_op=TensorOperation.PassThrough,
# b_elem_op=TensorOperation.PassThrough,
# epilogue_functor=TensorOperation.PassThrough,
# gemm_specialization=GemmType.GemmDefault,
# tile_desc=TileDesc(256, 256, 128, 32, 8, 2, 32, 32, 1, [8,2], [8,2]),
# a_block_transfer=BlockTransferDesc(
# [2, 1, 4, 2], [8, 1, 32, 1], [0, 3, 1, 2], [0, 3, 1, 2],[1, 1, 4, 1], [0, 3, 1, 2], [1, 1, 4, 2]
# ),
# b_block_transfer=BlockTransferDesc(
# [2, 1, 4, 2], [8, 1, 32, 1], [0, 3, 1, 2], [0, 3, 1, 2], [1, 1, 4, 1], [0, 3, 1, 2], [1, 1, 4, 2]
# ),
# c_block_transfer=CBlockTransferDesc([0, 1, 2, 3, 4, 5], 5, 4),
# )
# a = EmitGemmInstance()
# a.emit(gemm)
#include "run.h"

// Driver that exercises every generated GEMM instance linked in from the
// shared library. Each run_gemm_<block>_<m>_<n>_<k>_<k1> entry point parses
// (argc, argv) itself and returns a boolean status.
//
// Previously all return values were discarded and main fell off the end;
// now the statuses are aggregated so the process exit code (0 = all
// succeeded, 1 = at least one failed) is usable from scripts/CI.
int main(int argc, char* argv[])
{
    bool ok = true;
    ok &= run_gemm_128_32_128_8_2(argc, argv);
    ok &= run_gemm_128_64_32_8_2(argc, argv);
    ok &= run_gemm_128_64_128_8_2(argc, argv);
    ok &= run_gemm_128_128_32_8_2(argc, argv);
    ok &= run_gemm_128_128_64_8_2(argc, argv);
    ok &= run_gemm_256_64_128_8_2(argc, argv);
    ok &= run_gemm_256_128_64_8_2(argc, argv);
    ok &= run_gemm_256_128_128_8_2(argc, argv);
    ok &= run_gemm_256_128_128_16_2(argc, argv);
    return ok ? 0 : 1;
}
...@@ -26,38 +26,51 @@ def SubstituteTemplate(template, values): ...@@ -26,38 +26,51 @@ def SubstituteTemplate(template, values):
class EmitMake: class EmitMake:
def __init__(self): def __init__(self):
self.make_template = """ self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/ CFLAGS=-I 
~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17 CXXFLAGS = -std=c++17
device_memory.o: ../../../../../library/src/utility/device_memory.cpp device_memory.o: ../../library/src/utility/device_memory.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp
host_tensor.o: ../../library/src/utility/host_tensor.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp main.o: main.cpp
hipcc -shared -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp
obj_files = 256.o obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o
%.o : %.cpp %.o : %.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $< hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
done: libtest.so
cp libtest.so /lib
all: test.so main: main.o device_memory.o host_tensor.o $(obj_files)
hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main
test.so: $(obj_files) host_tensor.o device_memory.o libtest.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
all: done main.o
hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python -l test main.o -o example
clean: clean:
rm -f *.o test.so rm -f *.o libtest.so example Makefile
""" """
def emit(self,operation): def emit(self, instances):
obj_files = instances
values = { values = {
'temp' : "" 'obj_files' : str(instances)
} }
m_template = self.make_template m_template = self.make_template
cf = open("Makefile", 'w') cf = open("Makefile", 'w')
print(SubstituteTemplate(m_template, values))
cf.write(SubstituteTemplate(m_template, values)) cf.write(SubstituteTemplate(m_template, values))
cf.close() cf.close()
...@@ -65,11 +78,11 @@ clean: ...@@ -65,11 +78,11 @@ clean:
STDOUT = -2 STDOUT = -2
proc = subprocess.Popen( proc = subprocess.Popen(
["make"], ["make all"],
shell=True, shell=True,
env=os.environ.copy(), env=os.environ.copy(),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
) )
out, err = proc.communicate() out, err = proc.communicate()
\ No newline at end of file
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/ CFLAGS=-I 
~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17 CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../../library/src/utility/device_memory.cpp device_memory.o: ../../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp
ex.o: main.o: main.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c main.cpp
obj_files = 256_128_128_8_2.o 256_128_128_16_2.o 128_32_128_8_2.o 128_64_32_8_2.o 128_64_128_8_2.o 128_128_32_8_2.o 128_128_64_8_2.o 256_64_128_8_2.o 256_128_64_8_2.o
%.o : %.cpp
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c $<
done: libtest.so
cp libtest.so /lib
main: main.o device_memory.o host_tensor.o $(obj_files)
hipcc $(CXXFLAGS) $(CFLAGS) main.o host_tensor.o device_memory.o $(obj_files) -o main
libtest.so: $(obj_files) host_tensor.o device_memory.o
hipcc -shared $(CXXFLAGS) $(CFLAGS) -o $@ $(obj_files) host_tensor.o device_memory.o
all: done main.o
hipcc $(CXXFLAGS) $(CFLAGS) -L/root/workspace/composable_kernel/python/ait_impl/generation/ex/shared -l test main.o -o example
clean:
rm -f *.o libtest.so example
\ No newline at end of file
#pragma once

// Declarations of the generated GEMM instance entry points built into the
// shared library. Naming: run_gemm_<blockSize>_<mPerBlock>_<nPerBlock>_<kPerBlock>_<k1>.
// Each entry point parses (argc, argv) for the problem size and execution
// config and returns the example's boolean status (true when the instance
// ran — or when argument parsing failed, mirroring the generated examples'
// `!parse_cmd_args(...) || run_gemm_...(...)` convention).
//
// NOTE(review): a header guard was missing; added `#pragma once` so the
// included project headers are not processed twice per translation unit.

#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"

bool run_gemm_128_32_64_8_2(int argc, char* argv[]);
bool run_gemm_128_32_128_8_2(int argc, char* argv[]);
bool run_gemm_128_64_32_8_2(int argc, char* argv[]);
bool run_gemm_128_64_128_8_2(int argc, char* argv[]);
bool run_gemm_128_128_32_8_2(int argc, char* argv[]);
bool run_gemm_128_128_64_8_2(int argc, char* argv[]);
bool run_gemm_256_64_128_8_2(int argc, char* argv[]);
bool run_gemm_256_128_64_8_2(int argc, char* argv[]);
bool run_gemm_256_128_128_8_2(int argc, char* argv[]);
bool run_gemm_256_128_128_16_2(int argc, char* argv[]);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment