"...git@developer.sourcefind.cn:cnjsdfcy/simbricks.git" did not exist on "d5b0e45bc51317fff3778f548d1aefcd93ee916f"
Commit 47cc9b7e authored by Astha Rai

added compilation of shared library and multiple instances for gemm, cleaned up code design

parent adbefd90
# CK Python API
This API uses Python to generate instances of operations present in CK, compile them into a shared library, and build an executable that runs the instances.
There are two directories: `shared` and `normal`. The `normal` directory contains a single instance that is compiled into an executable to be run, while the `shared` directory generates multiple instances and compiles them into a shared library.
## Normal
## Shared
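Either way, the workflow is to run the Python generator, which substitutes the chosen GEMM parameters into a C++ template, writes the source plus a Makefile, and (for the executable variant) invokes `make`. A minimal sketch of driving it from Python is shown below; the module name `gemm_emitter` is an assumption, since the actual script names are not visible in this diff.

```python
# Hypothetical driver -- `gemm_emitter` stands in for the generator script in
# the shared/ or normal/ directory; the real module name is not shown here.
from gemm_emitter import EmitGemmInstance

EmitGemmInstance().emit()  # writes the generated C++ source and a Makefile
                           # (the executable variant also runs `make` itself)
```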
gemm: xx.o
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
xx.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c xx.cpp
CC = {{cc}}
CFLAGS = {{CFLAGS}}
fPIC_flag = {{fPIC}}
obj_files = {{obj_files}}
%.obj : %.{{cpp}}
{{cfile_cmd}}
%.obj : %.bin
{{bfile_cmd}}
.PHONY: all clean clean_constants
all: {{target}}
{{target}}: $(obj_files)
$(CC) -shared $(fPIC_flag) $(CFLAGS) -o $@ $(obj_files)
clean:
rm -f *.obj {{target}} test.so
clean_constants:
rm -f constants.bin
import enum
import os.path
import shutil
import functools
import operator
import collections
import subprocess
import re
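# SubstituteTemplate repeatedly replaces ${key} placeholders in `template`
# with the corresponding strings from `values` until no substitution changes
# the text anymore.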
def SubstituteTemplate(template, values):
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            newtext = re.sub(regex, value, text)
            if newtext != text:
                changed = True
            text = newtext
    return text
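# EmitGemmInstance holds a Makefile template and a DeviceGemmDl C++ template;
# emit() fills in one set of GEMM tuning parameters, writes ex.cpp and a
# Makefile, and runs make to build the example.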
class EmitGemmInstance:
    def __init__(self):
        self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""
    def emit(self):
        values = {
'type_a' : 'ck::half_t',
'type_b' : 'ck::half_t',
'type_c' : 'ck::half_t',
'type_acc' : 'float',
'layout_a' : 'ck::tensor_layout::gemm::ColumnMajor',
'layout_b' : 'ck::tensor_layout::gemm::RowMajor',
'layout_c' : 'ck::tensor_layout::gemm::RowMajor',
'elementwise_op_a' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_b' : 'ck::tensor_operation::element_wise::PassThrough',
'elementwise_op_c' : 'ck::tensor_operation::element_wise::PassThrough',
'Gemm_spec' : 'ck::tensor_operation::device::GemmSpecialization::Default',
'block_size' : '256',
'mperblock' : '128',
'nperblock' : '128',
'k0perblock' : '16',
'k1' : '2',
'm1perthread' : '4',
'n1perthread' : '4',
'kperthread' : '1',
'm1n1_thcluster_m1xs' : 'S<8, 2>',
'm1n1_thcluster_n1xs' : 'S<8, 2>',
'ABT_thread_slice_lengths_K0_M0_M1_K1' : 'S<2, 1, 4, 2>',
'ABT_thread_cluster_lengths_K0_M0_M1_K1' : 'S<8, 1, 32, 1>',
'ABT_thread_cluster_arrange_order' : 'S<0, 3, 1, 2>',
'ABT_src_access_order' : 'S<0, 3, 1, 2>',
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1' : 'S<1, 1, 4, 1>',
'ABT_src_vec_tensor_cont_dim_order' : 'S<0, 3, 1, 2>',
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1' : 'S<1, 1, 4, 2>',
'BBT_thread_slice_lengths_K0_N0_N1_K1' : 'S<2, 1, 4, 2>',
'BBT_thread_cluster_lengths_K0_N0_N1_K1' : 'S<8, 1, 32, 1>',
'BBT_thread_cluster_arrange_order' : 'S<0, 3, 1, 2>',
'BBT_src_access_order' : 'S<0, 3, 1, 2>',
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1' : 'S<1, 1, 4, 1>',
'BBT_src_vec_tensor_cont_dim_order' : 'S<0, 3, 1, 2>',
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 2>',
'CTT_src_dst_access_order' : 'S<0, 1, 2, 3, 4, 5>',
'CTT_src_dst_vec_dim' : '5',
'CTT_dst_scalar_per_vector' : '4'
}
        template = self.gemm_devop_template
        cf = open("ex.cpp", 'w')
        print(SubstituteTemplate(template, values))
        cf.write(SubstituteTemplate(template, values))
        cf.close()
        m_template = self.make_template
        cf = open("Makefile", 'w')
        print(SubstituteTemplate(m_template, values))
        cf.write(SubstituteTemplate(m_template, values))
        cf.close()
        PIPE = -1
        STDOUT = -2
        proc = subprocess.Popen(
            ["make"],
            shell=True,
            env=os.environ.copy(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = proc.communicate()
a = EmitGemmInstance()
a.emit()
import enum
import os.path
import shutil
import functools
import operator
import collections
import re
def SubstituteTemplate(template, values):
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            newtext = re.sub(regex, value, text)
            if newtext != text:
                changed = True
            text = newtext
    return text
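# Variant that emits a single hard-coded DeviceGemmDl instance into xx.cpp;
# the template body contains no ${...} placeholders, so SubstituteTemplate
# leaves it unchanged.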
class EmitGemmInstance:
    def __init__(self):
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""
    def emit(self):
        values = {
            'type_a' : 'ck::half_t',
        }
        template = self.gemm_devop_template
        cf = open("xx.cpp", 'w')
        print(SubstituteTemplate(template, values))
        cf.write(SubstituteTemplate(template, values))
        cf.close()
a = EmitGemmInstance()
a.emit()
CC = /opt/rocm/bin/hipcc
CK_PATH=/dockerx/composable_kernel/
CFLAGS = -O3 -std=c++17 -DCK_AMD_GPU_GFX90A --offload-arch=gfx90a -I"${CK_PATH}/include" -I"${CK_PATH}/library/include" -I"${CK_PATH}/profiler/include"
OBJS = ex.o host_tensor.o device_memory.o
all: $(OBJS)
$(CC) $(CFLAGS) $(OBJS) -o ex
device_memory.o: ../../library/src/utility/device_memory.cpp
$(CC) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp
host_tensor.o: ../../library/src/utility/host_tensor.cpp
$(CC) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
import enum
import os.path
import shutil
import functools
import operator
import collections
import re
def SubstituteTemplate(template, values):
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            newtext = re.sub(regex, value, text)
            if newtext != text:
                changed = True
            text = newtext
    return text
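# Duplicate of the single-instance emitter above; it also writes xx.cpp.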
class EmitGemmInstance:
    def __init__(self):
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""
    def emit(self):
        values = {
            'type_a' : 'ck::half_t',
        }
        template = self.gemm_devop_template
        cf = open("xx.cpp", 'w')
        print(SubstituteTemplate(template, values))
        cf.write(SubstituteTemplate(template, values))
        cf.close()
a = EmitGemmInstance()
a.emit()
# TODO: take the GEMM problem size as user input and feed it into the example template.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
// #include <half.hpp>
#include <random>
#include <rocrand/rocrand.h>
#include "logging.h"
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
using gemm_2_hhh_TTT_256_64_128_32_8_2_32_32_1_2_PT = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle<
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::MNKPadding,
1,
256, // block_size
64, // m_per_block
128, // n_per_block
32, // k_per_block
8, // ak1
2, // bk1
32, // m_per_xdl
32, // n_per_xdl
1, // m_xdl_per_wave
2, // n_xdl_per_wave
ck::Sequence<4,64,1>, // thread_cluster_length
ck::Sequence<1,0,2>, // thread_cluster_arrange_order
ck::Sequence<1,0,2>, // src_access_order
2, // src_vector_dim
8, // src_scalar_per_vector
8, // dst_scalar_per_vector
1, // add_extra_dim
ck::Sequence<8,32,1>, // thread_cluster_length
ck::Sequence<0,2,1>, // thread_cluster_arrange_order
ck::Sequence<0,2,1>, // src_access_order
1, // src_vector_dim
4, // src_scalar_per_vector
2, // dst_scalar_per_vector
0, // add_extra_dim
1, // m_xdl_per_wave
1, // n_xdl_per_wave
ck::Sequence<1,32,1,8>, // m_n_block_wave_per_xdl
8 // scalar_per_vector
>;
using fe7d3cbb34ba481ca532c1fecec7ec7cc5fbb35d0 = gemm_2_hhh_TTT_256_64_128_32_8_2_32_32_1_2_PT;
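// gemm_rrr_3 launches the fp16 row/row/row GEMM instance defined above on the
// supplied HIP stream. Only (M, N, K) = (256, 32, 128) is instantiated here;
// any other shape aborts via LOG(FATAL).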
void gemm_rrr_3(
void * in_ptr,
void * weight_ptr,
void * out_ptr,
int64_t* a_dim0,
int64_t* a_dim1,
int64_t* b_dim0,
int64_t* b_dim1,
int64_t* c_dim0,
int64_t* c_dim1,
hipStream_t stream
) {
ck::index_t M = (*a_dim0);
ck::index_t N = (*b_dim1);
ck::index_t K = (*a_dim1);
int64_t offset_a = 0;
int64_t offset_b = 0;
int64_t offset_c = 0;
ck::index_t stride_a = *a_dim1;
ck::index_t stride_b = *b_dim1;
ck::index_t stride_c = *c_dim1;
if (M == 256 && N == 32 && K == 128) {
auto op = fe7d3cbb34ba481ca532c1fecec7ec7cc5fbb35d0{};
auto invoker = op.MakeInvoker();
auto argument = op.MakeArgument(
static_cast<ck::half_t *>(in_ptr) + offset_a,
static_cast<ck::half_t *>(weight_ptr) + offset_b,
static_cast<ck::half_t *>(out_ptr) + offset_c,
M,
N,
K,
stride_a,
stride_b,
stride_c,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}
);
if(!op.IsSupportedArgument(argument)) {
LOG(FATAL) << "wrong! " << op.GetTypeString() << " with the specified compilation parameters does not support this Gemm problem.";
}
invoker.Run(argument, StreamConfig{stream, false});
return;
}
LOG(FATAL) << "Unsupported workload for this gemm specialization.";
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
// #include <half.hpp>
#include <random>
#include <rocrand/rocrand.h>
#include "logging.h"
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
using gemm_2_hhh_TTT_256_64_128_32_8_2_32_32_1_2_PT = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle<
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::GemmSpecialization::MNKPadding,
1,
256, // block_size
64, // m_per_block
128, // n_per_block
32, // k_per_block
8, // ak1
2, // bk1
32, // m_per_xdl
32, // n_per_xdl
1, // m_xdl_per_wave
2, // n_xdl_per_wave
ck::Sequence<4,64,1>, // thread_cluster_length
ck::Sequence<1,0,2>, // thread_cluster_arrange_order
ck::Sequence<1,0,2>, // src_access_order
2, // src_vector_dim
8, // src_scalar_per_vector
8, // dst_scalar_per_vector
1, // add_extra_dim
ck::Sequence<8,32,1>, // thread_cluster_length
ck::Sequence<0,2,1>, // thread_cluster_arrange_order
ck::Sequence<0,2,1>, // src_access_order
1, // src_vector_dim
4, // src_scalar_per_vector
2, // dst_scalar_per_vector
0, // add_extra_dim
1, // m_xdl_per_wave
1, // n_xdl_per_wave
ck::Sequence<1,32,1,8>, // m_n_block_wave_per_xdl
8 // scalar_per_vector
>;
using fe7d3cbb34ba481ca532c1fecec7ec7cc5fbb35d0 = gemm_2_hhh_TTT_256_64_128_32_8_2_32_32_1_2_PT;
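// gemm_rrr_3 launches the fp16 row/row/row GEMM instance defined above on the
// supplied HIP stream. Only (M, N, K) = (256, 32, 128) is instantiated here;
// any other shape aborts via LOG(FATAL).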
void gemm_rrr_3(
void * in_ptr,
void * weight_ptr,
void * out_ptr,
int64_t* a_dim0,
int64_t* a_dim1,
int64_t* b_dim0,
int64_t* b_dim1,
int64_t* c_dim0,
int64_t* c_dim1,
hipStream_t stream
) {
ck::index_t M = (*a_dim0);
ck::index_t N = (*b_dim1);
ck::index_t K = (*a_dim1);
int64_t offset_a = 0;
int64_t offset_b = 0;
int64_t offset_c = 0;
ck::index_t stride_a = *a_dim1;
ck::index_t stride_b = *b_dim1;
ck::index_t stride_c = *c_dim1;
if (M == 256 && N == 32 && K == 128) {
auto op = fe7d3cbb34ba481ca532c1fecec7ec7cc5fbb35d0{};
auto invoker = op.MakeInvoker();
auto argument = op.MakeArgument(
static_cast<ck::half_t *>(in_ptr) + offset_a,
static_cast<ck::half_t *>(weight_ptr) + offset_b,
static_cast<ck::half_t *>(out_ptr) + offset_c,
M,
N,
K,
stride_a,
stride_b,
stride_c,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}
);
if(!op.IsSupportedArgument(argument)) {
LOG(FATAL) << "wrong! " << op.GetTypeString() << " with the specified compilation parameters does not support this Gemm problem.";
}
invoker.Run(argument, StreamConfig{stream, false});
return;
}
LOG(FATAL) << "Unsupported workload for this gemm specialization.";
}
size_t GLOBAL_WORKSPACE_SIZE = 0;
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <random>
#include <rocrand/rocrand.h>
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/reduction_operator.hpp"
#include "include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
using layernorm_rank2_256_1_256_1_8_1_8_1_8_1_8_8 = ck::tensor_operation::device::DeviceLayernormImpl<
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
2,
1,
256, // block_size
1, // m_cluster_size
256, // k_cluster_size
1, // m_slice_size
8, // k_slice_size
1, // in_src_dim
8, // in_src_size
1, // gamma_src_dim
8, // gamma_src_size
1, // beta_src_dim
8, // beta_src_size
8 // out_dst_size
>;
using DeviceInstance = layernorm_rank2_256_1_256_1_8_1_8_1_8_1_8_8;
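// layernorm_0 runs the DeviceLayernormImpl instance above over an (M, N)
// row-major input with length-N gamma/beta vectors and epsilon = 1e-5.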
void layernorm_0(void* input,
void* gamma,
void* beta,
void* output,
int64_t* in_0,
int64_t* in_1,
hipStream_t stream)
{
int M = *in_0;
int N = *in_1;
std::vector<ck::index_t> i_inStrides;
i_inStrides.push_back(N);
i_inStrides.push_back(1);
auto device_instance = DeviceInstance{};
auto argument_ptr = device_instance.MakeArgumentPointer(
{M, N},
i_inStrides,
std::vector<ck::index_t>{0, 1},
std::vector<ck::index_t>{0, 1},
i_inStrides,
{1},
1e-05,
static_cast<ck::half_t *>(input),
static_cast<ck::half_t *>(gamma),
static_cast<ck::half_t *>(beta),
static_cast<ck::half_t *>(output),
ck::tensor_operation::element_wise::PassThrough{}
);
if(!device_instance.IsSupportedArgument(argument_ptr.get()))
{
throw std::runtime_error(
"wrong! device_layernorm with the specified compilation parameters does "
"not support this LayerNorm problem");
};
std::string instance_name = device_instance.GetTypeString();
auto invoker_ptr = device_instance.MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{stream, false});
return;
}
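// ProfilerMemoryPool owns device buffers filled with Gaussian random data via
// rocRAND; each tensor keeps `copy` rotating replicas so that repeated
// profiler runs touch different memory.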
struct ProfilerMemoryPool {
ProfilerMemoryPool() {
std::random_device rd;
gen = std::mt19937(rd());
uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
// The generator member was never initialized; create it here so that
// rocrand_set_seed / rocrand_generate_normal below operate on a valid handle.
rocrand_create_generator(&generator, ROCRAND_RNG_PSEUDO_DEFAULT);
offsets.reserve(512);
strides.reserve(512);
copies.reserve(512);
ptrs.reserve(512);
}
~ProfilerMemoryPool() {
for(int i = 0; i < ptrs.size(); i++){
hipFree(ptrs[i]);
}
rocrand_destroy_generator(generator);
}
template <typename DType>
DType* AllocateGaussianTensor(int64_t size) {
size_t length = size * sizeof(DType);
DType *d_x;
hipMalloc(&d_x, length);
float mean = 0.0f;
float stddev = 1.0f;
uint64_t seed = uniform_dist(gen);
rocrand_set_seed(generator, seed);
rocrand_generate_normal(generator, reinterpret_cast<float*>(d_x), size, mean, stddev);
return d_x;
}
ck::half_t* AllocateHalfGaussianTensor(int64_t size) {
return reinterpret_cast<ck::half_t*>(
AllocateGaussianTensor<ck::half_t>(size));
}
int AllocateHalfTensor(int64_t size, int64_t copy) {
offsets.push_back(0);
strides.push_back(size);
copies.push_back(copy);
auto ptr = AllocateHalfGaussianTensor(size * copy);
ptrs.push_back(reinterpret_cast<void*>(ptr));
return ptrs.size() - 1;
}
ck::half_t* RequestHalfTensorByIdx(int idx) {
auto copy = copies.at(idx);
auto offset = offsets.at(idx);
auto stride = strides.at(idx);
ck::half_t* ptr = reinterpret_cast<ck::half_t*>(ptrs.at(idx));
ptr += offset;
offset += stride;
if (offset == copy * stride) {
offset = 0;
}
offsets[idx] = offset;
return ptr;
}
std::vector<int64_t> offsets;
std::vector<int64_t> strides;
std::vector<int64_t> copies;
std::vector<void*> ptrs;
std::mt19937 gen;
std::uniform_int_distribution<int64_t> uniform_dist;
rocrand_generator generator;
};
// hack for DeviceMem linking error
// TODO fix this by making CK a header-only lib
// <<< hack begin
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p) const
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p) const
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
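// Minimal hipEvent-based timer used to time the profiled kernel launches.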
struct KernelTimerImpl
{
KernelTimerImpl() {
hipGetErrorString(hipEventCreate(&mStart));
hipGetErrorString(hipEventCreate(&mEnd));
}
~KernelTimerImpl() {
hipGetErrorString(hipEventDestroy(mStart));
hipGetErrorString(hipEventDestroy(mEnd));
}
void Start() {
hipGetErrorString(hipDeviceSynchronize());
hipGetErrorString(hipEventRecord(mStart, nullptr));
}
void End() {
hipGetErrorString(hipEventRecord(mEnd, nullptr));
hipGetErrorString(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const {
float time;
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
hipEvent_t mStart, mEnd;
};
// >>> hack end
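// Profiler entry point: argv[1] x argv[2] is the (M, N) problem size; three
// warm-up launches are followed by five timed launches of layernorm_0.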
int main(int argc, char** argv) {
const int64_t in_0 = std::stoi(argv[1]);
const int64_t in_1 = std::stoi(argv[2]);
auto memory_pool = std::make_unique<ProfilerMemoryPool>();
hipStream_t stream = nullptr;
int64_t ptr_sz = in_0 * in_1;
int64_t norm_dim = in_1;
// TODO: special pool size for 8M L2 cache
// need to tune it for other devices
int64_t mem_pool_sz = std::max(2, std::min(64, int((1 << 23) / ptr_sz)));
memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz); // in: index 0
memory_pool->AllocateHalfTensor(ptr_sz, mem_pool_sz); // out: index 1
memory_pool->AllocateHalfTensor(norm_dim, mem_pool_sz); // gamma: index 2
memory_pool->AllocateHalfTensor(norm_dim, mem_pool_sz); // beta: index 3
// warmup
for(int i = 0; i < 3; ++i) {
layernorm_0(
(void *) memory_pool->RequestHalfTensorByIdx(0),
(void *) memory_pool->RequestHalfTensorByIdx(2),
(void *) memory_pool->RequestHalfTensorByIdx(3),
(void *) memory_pool->RequestHalfTensorByIdx(1),
const_cast<int64_t *>(&in_0),
const_cast<int64_t *>(&in_1),
stream
);
}
// run
KernelTimerImpl timer;
timer.Start();
for(int i = 0; i < 5; ++i) {
layernorm_0(
(void *) memory_pool->RequestHalfTensorByIdx(0),
(void *) memory_pool->RequestHalfTensorByIdx(2),
(void *) memory_pool->RequestHalfTensorByIdx(3),
(void *) memory_pool->RequestHalfTensorByIdx(1),
const_cast<int64_t *>(&in_0),
const_cast<int64_t *>(&in_1),
stream
);
}
timer.End();
std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;
}
#pragma once
#include "logging.h"
#include "device_functions-generated.h"
#include "model_interface.h"
#include "raii_wrapper.h"
#include "macros.h"
#include <algorithm>
#include <deque>
#include <string>
#include <unordered_map>
#include <math.h>
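// Forward declaration of the generated GEMM launcher; its definition is
// emitted separately by the code generator (see gemm_rrr_3 above).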
void gemm_rrr_3(
void *,
void *,
void *,
int64_t*,
int64_t*,
int64_t*,
int64_t*,
int64_t*,
int64_t*,
hipStream_t
);
#define CHECK_VECTOR_ACCESS(vector, idx) \
if (idx >= vector.size()) { \
throw std::out_of_range( \
"[__func__]: index out of range, " #vector ".size()=" + \
std::to_string(vector.size()) + ", got " + std::to_string(idx)); \
}
namespace ait {
namespace {
void DeviceCheckLastError(const char* file, int line) {
auto device_error = GetLastError();
if (device_error != GetDeviceSuccess()) {
std::string msg = std::string("Got error: ") + GetLastErrorString() +
" enum: " + std::to_string(device_error) +
" at " + file + ": " + std::to_string(line);
LOG(ERROR) << msg;
throw std::runtime_error(msg);
}
}
thread_local bool target_has_graph_mode = false;
} // namespace
// Model is the class that actually performs inference. It owns memory for
// intermediate tensors and dynamic dimensions. Constants are owned by
// the model's owning container object, and input/output memory is owned
// by the user.
// Once an inference run has started, it is not safe to re-use the Model
// until the run has finished!
class Model {
public:
Model(
size_t blob_size,
size_t workspace_size,
size_t num_inputs,
size_t num_outputs,
size_t num_unbound_constants,
uint8_t* constants,
AITemplateAllocator& allocator)
: blob_(RAII_DeviceMalloc(blob_size, allocator)),
workspace_(RAII_DeviceMalloc(workspace_size, allocator)),
params_(num_inputs + num_outputs + num_unbound_constants),
num_inputs_(num_inputs),
num_outputs_(num_outputs),
constants_(constants) {
dmlc::InitLogging("aitemplate"); // TODO(xxx): render network name
LOG(INFO) << "Init AITemplate Runtime.";
global_workspace_ = static_cast<uint8_t*>(workspace_.get()) + 0;
unique_workspace_ = static_cast<uint8_t*>(workspace_.get());
DEVICE_CHECK(GetDevice(&device_idx_))
DEVICE_CHECK(CreateEvent(&run_finished_));
#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
DEVICE_CHECK(cudaDeviceGetAttribute(
&max_smem_size_, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_idx_));
#endif
DEVICE_CHECK(GetDeviceProperties(&device_properties_, device_idx_));
DEVICE_CHECK(StreamCreate(&graph_capture_stream_, /*non_blocking=*/true));
InitConstants(constants_);
auto* blob_ptr = static_cast<uint8_t*>(blob_.get());
}
~Model() {
if (run_finished_ != nullptr) {
DestroyEvent(run_finished_);
}
if (graph_capture_stream_ != nullptr) {
StreamDestroy(graph_capture_stream_);
}
if (graph_exec_ != nullptr) {
GraphExecDestroy(graph_exec_);
}
}
Model(Model&& other) {
run_finished_ = other.run_finished_;
graph_exec_ = other.graph_exec_;
graph_capture_stream_ = other.graph_capture_stream_;
other.run_finished_ = nullptr;
other.graph_exec_ = nullptr;
other.graph_capture_stream_ = nullptr;
constants_ = other.constants_;
num_inputs_ = other.num_inputs_;
global_workspace_ = other.global_workspace_;
unique_workspace_ = other.unique_workspace_;
workspace_ = std::move(other.workspace_);
params_ = std::move(other.params_);
constant_name_to_ptr_ = std::move(other.constant_name_to_ptr_);
// Re-wire the pointers in the above 2 structures.
InitConstants(constants_);
}
Model& operator=(Model&&) = delete;
Model(const Model&) = delete;
Model& operator=(const Model&) = delete;
void SetUpInputsOutputs() {
input_0 = static_cast<decltype(input_0)>(params_[0].ptr);
if (input_0 == nullptr) {
throw std::runtime_error("Constant input_0 was not set! Set the value with set_constant.");
}
input_1 = static_cast<decltype(input_1)>(params_[1].ptr);
if (input_1 == nullptr) {
throw std::runtime_error("Constant input_1 was not set! Set the value with set_constant.");
}
output_0 = static_cast<decltype(output_0)>(params_[2].ptr);
if (output_0 == nullptr) {
throw std::runtime_error("Constant output_0 was not set! Set the value with set_constant.");
}
}
void DeviceToDeviceCopies(StreamType stream) {
}
void Run(StreamType stream, bool graph_mode) {
SetUpInputsOutputs();
if (target_has_graph_mode && graph_mode) {
RunAsGraph(stream);
} else {
RunImpl(stream);
}
DEVICE_CHECK(EventRecord(run_finished_, stream));
}
void RunImpl(StreamType stream) {
gemm_rrr_3(
input_0,
input_1,
output_0,
&input_0_dim_0,
&input_0_dim_1,
&input_1_dim_0,
&input_1_dim_1,
&input_0_dim_0,
&input_1_dim_1,
stream
);
DeviceCheckLastError(__FILE__, __LINE__);
DeviceToDeviceCopies(stream);
}
bool IsPending() {
auto query = QueryEvent(run_finished_);
if (query == GetDeviceNotReady()) {
return true;
}
if (query != GetDeviceSuccess()) {
LOG(WARNING) << "Pending model run did not finish successfully. Error: "
<< GetErrorString(query);
}
return false;
}
void WaitForCompletion() {
DEVICE_CHECK(EventSynchronize(run_finished_));
}
size_t NumInputs() const {
return num_inputs_;
}
size_t NumOutputs() const {
return num_outputs_;
}
void SetParam(const void* src, size_t param_idx) {
CHECK_VECTOR_ACCESS(params_, param_idx)
// const_cast is not ideal here, but it is unfortunately
// necessary:
// 1) We store outputs and inputs in the same vector,
// and outputs cannot be const.
// 2) Most of the codegen is not const-correct (most ops
// require non-const pointers). So even if we put const
// pointers into params, a const_cast would be required
// somewhere else.
params_[param_idx].ptr = const_cast<void*>(src);
}
void SetInput(const void* src, const AITemplateParamShape& shape, size_t idx) {
SetInputShape(shape, idx);
SetParam(src, idx);
}
void SetOutput(void* src, size_t idx) {
SetParam(src, idx + num_inputs_);
}
// Write the (possibly dynamic) output shape to the given pointer.
// Note that this should be called _after_ the shape inference in
// Run() is finished. output_shape_out should be able to store
// at least GetOutputMaximumShape(idx).size values.
void GetOutputShape(size_t idx, int64_t* output_shape_out) {
const auto param_idx = idx + num_inputs_;
CHECK_VECTOR_ACCESS(params_, param_idx);
const auto& shape_ptrs = params_[param_idx].shape_ptrs;
for (size_t i = 0; i < shape_ptrs.size(); ++i) {
output_shape_out[i] = shape_ptrs[i].GetValue();
}
}
void SetConstant(const char* name, const void* src) {
auto it = constant_name_to_ptr_.find(name);
if (it == constant_name_to_ptr_.end()) {
throw std::out_of_range(std::string("Could not find constant ") + name);
}
const void** ptr = it->second;
*ptr = src;
}
private:
void InitConstants(uint8_t* constants) {
params_[0].shape_ptrs = {ParamDim(256, 256, &input_0_dim_0), ParamDim(128, 128, &input_0_dim_1)};
params_[1].shape_ptrs = {ParamDim(128, 128, &input_1_dim_0), ParamDim(32, 32, &input_1_dim_1)};
params_[2].shape_ptrs = {ParamDim(256, 256, &input_0_dim_0), ParamDim(32, 32, &input_1_dim_1)};
}
void SetInputShape(const AITemplateParamShape& shape, size_t idx) {
auto& param = params_[idx];
if (shape.size != param.shape_ptrs.size()) {
throw std::runtime_error(
"[SetInputShape] Got wrong param shape for input " + std::to_string(idx) +
"; expected " + std::to_string(param.shape_ptrs.size()) + ", got " +
std::to_string(shape.size));
}
for (size_t i = 0; i < param.shape_ptrs.size(); ++i) {
param.shape_ptrs[i].SetValue(shape.shape_data[i]);
}
}
DeviceError EndCapture(GraphType* graph_ptr) {
auto err = StreamEndCapture(graph_capture_stream_, graph_ptr);
if (err != GetDeviceSuccess()) {
// If we can't take the stream out of capture mode, something is probably
// wrong with CUDA graph for this model (e.g. there might have been an
// illegal capture mode operation). Disable graph mode to avoid such issues
// in future iterations.
target_has_graph_mode = false;
LOG(WARNING) << "Graph capture failed to end. Disabling graph mode.";
return err;
}
return GetDeviceSuccess();
}
void RunAsGraph(StreamType stream) {
DEVICE_CHECK(StreamBeginCapture(graph_capture_stream_, /*global=*/false));
try {
RunImpl(graph_capture_stream_);
} catch (...) {
GraphType graph;
// No need to DEVICE_CHECK here, we want to see the original exception.
EndCapture(&graph);
if (graph != nullptr && GraphDestroy(graph) != GetDeviceSuccess()) {
LOG(WARNING) << "Graph destruction failed while handling exception! Memory will be leaked.";
}
throw;
}
// The following function ends the capture and creates a graph
// inside a unique_ptr that cleans it up when it goes out of scope.
// Note that it throws an exception if EndCapture fails.
auto graph = RAII_EndCaptureAndCreateGraph(
[this](GraphType* graph_ptr){ return EndCapture(graph_ptr); }
);
if (graph_exec_ == nullptr) {
DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
} else if (GraphExecUpdate(graph_exec_, graph.get()) != GetDeviceSuccess()) {
// Consume the last cuda error, which may affect the next GraphExecLaunch
// call.
GetLastError();
DEVICE_CHECK(GraphExecDestroy(graph_exec_));
DEVICE_CHECK(GraphInstantiate(&graph_exec_, graph.get()));
}
DEVICE_CHECK(GraphExecLaunch(graph_exec_, stream));
}
int device_idx_;
int max_smem_size_{0};
DevicePropertyType device_properties_;
// This event tracks when the inference is finished
// so that this Model may be reclaimed by its owning
// ModelContainer.
EventType run_finished_;
// A blob of memory used for storing intermediate tensors.
GPUPtr blob_;
// Memory for constants that were folded into the *.so. Unowned by Model,
// owned by ModelContainer.
// TODO: make this const. It can't be const right now because we derive
// tensor pointers from it, and no tensor pointers are const.
uint8_t* constants_;
size_t num_inputs_;
size_t num_outputs_;
// The workspace blob is used as scratch memory. See
// _generate_workspace in memory planning for more information.
GPUPtr workspace_;
uint8_t* global_workspace_{nullptr};
uint8_t* unique_workspace_{nullptr};
class ParamDim {
public:
ParamDim(int64_t lower_bound, int64_t upper_bound, int64_t* value) :
lower_bound_(lower_bound),
upper_bound_(upper_bound),
value_(value) {}
void SetValue(int64_t new_value) {
if (new_value < lower_bound_ || new_value > upper_bound_) {
throw std::out_of_range(
"[SetValue] Dimension got value out of bounds; expected value to be in [" +
std::to_string(lower_bound_) + ", " + std::to_string(upper_bound_) + "], but got " +
std::to_string(new_value)
);
}
*value_ = new_value;
}
int64_t GetValue() const {
return *value_;
}
private:
int64_t lower_bound_;
int64_t upper_bound_;
int64_t* value_;
};
struct ParamInfo {
void* ptr = nullptr;
// TODO add offset
const char* name;
std::vector<ParamDim> shape_ptrs;
};
// Contains info for all tensors marked as inputs
// or outputs. The first num_inputs elements are the inputs.
// Constants are not included.
std::vector<ParamInfo> params_;
GraphExecType graph_exec_ = nullptr;
StreamType graph_capture_stream_;
std::unordered_map<std::string, const void**> constant_name_to_ptr_;
void * input_0 {nullptr};
void * input_1 {nullptr};
void * output_0 {nullptr};
int64_t input_0_dim_0 { 256 };
int64_t input_0_dim_1 { 128 };
int64_t input_1_dim_0 { 128 };
int64_t input_1_dim_1 { 32 };
};
} // namespace ait