"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "ecdfe960921032c1aae6dc2c4a3e0ad1b8bba559"
Commit e20ed766 authored by carlushuang

format

parent 1e95a6e2
@@ -153,13 +153,13 @@ bool test_cast(ck_tile::ArgParser args)
         t_.input_type = input_prec;
         t_.output_type = output_prec;
         t_.op = std::string("cast");
         t_.num_cu = [&]() {
             hipDeviceProp_t dev_prop;
             hipDevice_t dev;
             HIP_CHECK_ERROR(hipGetDevice(&dev));
             HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev));
             return dev_prop.multiProcessorCount;
         }();
 
         return t_;
     }();
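For context, the num_cu lambda above caches the device's compute-unit count once so that the dispatch heuristics further down can scale with it. A minimal standalone sketch of the same query, using plain HIP return-code checks instead of the example's HIP_CHECK_ERROR macro:

    #include <hip/hip_runtime.h>
    #include <cstdio>

    // Returns the number of compute units (multiprocessors) on the current
    // device, or -1 if the query fails.
    int query_num_cu()
    {
        int dev = 0;
        if(hipGetDevice(&dev) != hipSuccess)
            return -1;
        hipDeviceProp_t prop;
        if(hipGetDeviceProperties(&prop, dev) != hipSuccess)
            return -1;
        return prop.multiProcessorCount;
    }

    int main()
    {
        printf("num_cu = %d\n", query_num_cu());
        return 0;
    }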
@@ -188,7 +188,8 @@ bool test_cast(ck_tile::ArgParser args)
     ck_tile::stream_config sc{stream_};
     HIP_CHECK_ERROR(hipStreamBeginCapture(sc.stream_id_, hipStreamCaptureModeGlobal));
 
-    for(int i_r = 0; i_r < repeat; i_r++) {
+    for(int i_r = 0; i_r < repeat; i_r++)
+    {
         elementwise(trait, karg, sc);
     }
     HIP_CHECK_ERROR(hipStreamEndCapture(sc.stream_id_, &graph_));
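The loop above is recorded into a HIP graph rather than executed eagerly: every launch issued between hipStreamBeginCapture and hipStreamEndCapture becomes a node of graph_. The hunk does not show the instantiate/replay step, so the following is only a sketch of the standard capture/instantiate/launch pattern with placeholder names (dummy_kernel, capture_and_replay), not the example's own code:

    #include <hip/hip_runtime.h>

    __global__ void dummy_kernel() {}

    // Capture `repeat` launches into a graph, then replay them with one launch.
    // Error checking omitted for brevity.
    void capture_and_replay(hipStream_t stream, int repeat)
    {
        hipGraph_t graph        = nullptr;
        hipGraphExec_t instance = nullptr;

        hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal);
        for(int i = 0; i < repeat; i++)
            dummy_kernel<<<1, 64, 0, stream>>>(); // recorded, not executed
        hipStreamEndCapture(stream, &graph);

        hipGraphInstantiate(&instance, graph, nullptr, nullptr, 0);
        hipGraphLaunch(instance, stream); // replays all captured launches
        hipStreamSynchronize(stream);

        hipGraphExecDestroy(instance);
        hipGraphDestroy(graph);
    }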
@@ -201,8 +202,9 @@ bool test_cast(ck_tile::ArgParser args)
     HIP_CHECK_ERROR(hipEventCreate(&start_));
     HIP_CHECK_ERROR(hipEventCreate(&stop_));
 
-    //warm-up
-    for(int i_r = 0; i_r < warpup; i_r++) {
+    // warm-up
+    for(int i_r = 0; i_r < warpup; i_r++)
+    {
         elementwise(trait, karg, sc);
     }
     HIP_CHECK_ERROR(hipDeviceSynchronize());
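The events created here are presumably used to time the captured graph; the measurement itself is outside this hunk, but the next hunk's ms = total_time / repeat line suggests one graph launch containing `repeat` kernels, averaged afterwards. A hedged sketch of that event-timing pattern (generic HIP, names are placeholders):

    #include <hip/hip_runtime.h>

    // Time one replay of a graph whose capture contains `repeat` launches and
    // return the average per-launch time in ms. Error checking omitted.
    float time_avg_ms(hipStream_t stream, hipGraphExec_t instance, int repeat)
    {
        hipEvent_t start, stop;
        hipEventCreate(&start);
        hipEventCreate(&stop);

        hipEventRecord(start, stream);
        hipGraphLaunch(instance, stream); // graph holds `repeat` kernel launches
        hipEventRecord(stop, stream);
        hipEventSynchronize(stop);

        float total_ms = 0.f;
        hipEventElapsedTime(&total_ms, start, stop);

        hipEventDestroy(start);
        hipEventDestroy(stop);
        return total_ms / repeat;
    }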
@@ -225,12 +227,17 @@ bool test_cast(ck_tile::ArgParser args)
         ms = total_time / repeat;
     }
 #endif
-    auto gbps = [&](){
+    auto gbps = [&]() {
         double total_bytes = num_pixels * sizeof(SrcType) + num_pixels * sizeof(DstType);
         return total_bytes / 1.E6 / ms;
     }();
-    printf(
-        "[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms*1e6, ms, gbps);
+    printf("[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, ",
+           input_prec.c_str(),
+           output_prec.c_str(),
+           num_pixels,
+           ms * 1e6,
+           ms,
+           gbps);
     if(ms < 0)
         printf("not supported\n");
     fflush(stdout);
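The gbps lambda reports effective memory bandwidth: each pixel is read once as SrcType and written once as DstType, so the bytes moved are num_pixels * (sizeof(SrcType) + sizeof(DstType)), and dividing bytes by 1e6 and then by milliseconds is the same as (bytes / 1e9) / seconds, i.e. GB/s. A tiny standalone check of the arithmetic with made-up numbers:

    #include <cstdio>
    #include <cstdint>

    int main()
    {
        // Example: 1<<26 fp16 pixels cast to fp32 in 1.5 ms (invented figures).
        uint64_t num_pixels = 1ull << 26;
        double ms           = 1.5;
        double total_bytes  = num_pixels * 2.0 + num_pixels * 4.0; // fp16 read + fp32 write
        double gbps         = total_bytes / 1.e6 / ms;             // == (bytes / 1e9) / (ms / 1e3)
        printf("%.2f GB/s\n", gbps); // ~268.44 GB/s for these numbers
        return 0;
    }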
@@ -12,20 +12,22 @@ struct Cast
 } // namespace impl
 
 #define DISPATCH_E_CAST_(d_type_, s_type_, byte_per_issue_, chunks_, bs_) \
     using src_t = s_type_; \
     using dst_t = d_type_; \
     using u_fun = typename impl::Cast; \
-    using problem = \
-        ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
+    using problem = ck_tile:: \
+        ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
     using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \
     using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \
     \
     auto kargs = kernel::MakeKargs(a); \
     const dim3 grids = kernel::GridSize(a); \
     constexpr dim3 blocks = kernel::BlockSize(); \
     \
     float ave_time = ck_tile::launch_kernel( \
-        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
+        s, \
+        ck_tile::make_kernel<blocks.x, 1>( \
+            kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
     return ave_time;
 
 float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
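Because DISPATCH_E_CAST_ expands into a series of using declarations plus a return, each invocation has to live in its own brace scope, which is exactly what the if/else ladder inside elementwise() below provides. A simplified sketch of the same macro-dispatch shape, with hypothetical stand-in names (launch_cast, DISPATCH_CAST_) rather than the real ck_tile kernel machinery:

    #include <cstdio>

    // Hypothetical stand-in for a templated kernel launcher.
    template <typename SrcT, typename DstT, int BlockSize>
    float launch_cast(const void*, void*, unsigned long long n)
    {
        printf("launch: %d-byte -> %d-byte, block %d, n=%llu\n",
               (int)sizeof(SrcT), (int)sizeof(DstT), BlockSize, n);
        return 0.f; // would return the measured time
    }

    // Same shape as DISPATCH_E_CAST_: type aliases plus a return, so every
    // expansion needs its own scope.
    #define DISPATCH_CAST_(d_type_, s_type_, bs_)              \
        using src_t = s_type_;                                 \
        using dst_t = d_type_;                                 \
        return launch_cast<src_t, dst_t, bs_>(p_in, p_out, n);

    float dispatch(const void* p_in, void* p_out, unsigned long long n)
    {
        if(n < 1024)
        {
            DISPATCH_CAST_(float, unsigned short, 64)
        }
        else
        {
            DISPATCH_CAST_(float, unsigned short, 256)
        }
    }

    int main()
    {
        float buf_in[4] = {};
        float buf_out[4];
        dispatch(buf_in, buf_out, 4);
        return 0;
    }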
@@ -36,49 +38,63 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
     if(t.output_type == "fp32" && t.input_type == "fp16")
     {
         constexpr int eb = sizeof(ck_tile::fp16_t);
-        if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
-            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 64)
+        if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64))
+        {
+            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 64)
         }
-        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
-            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 128)
+        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128))
+        {
+            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 128)
         }
-        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
-            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 256)
+        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3))
+        {
+            DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
         }
-        else if (a.num_pixels % 4 == 0) {
-            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
+        else if(a.num_pixels % 4 == 0)
+        {
+            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8))
+            {
                 DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 1, 256)
             }
-            else {
+            else
+            {
                 DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 8, 256)
             }
         }
-        else {
+        else
+        {
             DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
         }
     }
     else if(t.output_type == "fp16" && t.input_type == "fp32")
     {
         constexpr int eb = sizeof(float);
-        if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
-            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 64)
+        if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64))
+        {
+            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 64)
         }
-        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
-            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 128)
+        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128))
+        {
+            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 128)
        }
-        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
-            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
+        else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3))
+        {
+            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 256)
         }
-        else if (a.num_pixels % 4 == 0) {
-            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
+        else if(a.num_pixels % 4 == 0)
+        {
+            if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8))
+            {
                 DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 1, 256)
             }
-            else {
+            else
+            {
                 DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 8, 256)
             }
         }
-        else {
-            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
+        else
+        {
+            DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 256)
         }
     }
 }
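The ladder above picks launch parameters from problem size relative to CU count: small problems get small blocks (64/128 threads, scalar loads), medium ones 256 threads, and only large, 4-aligned problems use 4-element vector issues, switching to 8 chunks per block once the grid would otherwise oversubscribe. A compact restatement of that heuristic as data; the threshold multipliers are copied from the code above, while the struct and function names are illustrative only:

    #include <cstdint>

    struct cast_config
    {
        int vector_elems; // elements loaded per issue
        int chunks;       // issues per thread
        int block_size;   // threads per block
    };

    // Mirrors the if/else ladder in elementwise(): thresholds scale with num_cu.
    cast_config pick_cast_config(uint64_t num_pixels, int num_cu)
    {
        const uint64_t cu = static_cast<uint64_t>(num_cu);
        if(num_pixels < cu * 64)
            return {1, 1, 64};
        if(num_pixels < cu * 128)
            return {1, 1, 128};
        if(num_pixels < cu * 256 * 3)
            return {1, 1, 256};
        if(num_pixels % 4 == 0)
        {
            if(num_pixels < cu * 256 * 4 * 8)
                return {4, 1, 256};
            return {4, 8, 256};
        }
        return {1, 1, 256};
    }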
@@ -57,17 +57,15 @@ struct ElementwiseUnaryKernel
     CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }
 
-    CK_TILE_DEVICE void operator()(const void* p_input_,
-                                   void* p_output_,
-                                   uint64_t num_pixels_) const
+    CK_TILE_DEVICE void
+    operator()(const void* p_input_, void* p_output_, uint64_t num_pixels_) const
     {
         uint64_t block_base =
             static_cast<uint64_t>(blockIdx.x) * Problem::BlockSize * Problem::VectorSize;
         uint64_t pixels_rem = num_pixels_ - block_base;
 
         const auto input_window = [&]() {
-            const InputType* p_input =
-                reinterpret_cast<const InputType*>(p_input_) + block_base;
+            const InputType* p_input = reinterpret_cast<const InputType*>(p_input_) + block_base;
 
             auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                 p_input,
@@ -79,8 +77,7 @@ struct ElementwiseUnaryKernel
         }();
 
         auto output_window = [&]() {
-            OutputType* p_output =
-                reinterpret_cast<OutputType*>(p_output_) + block_base;
+            OutputType* p_output = reinterpret_cast<OutputType*>(p_output_) + block_base;
 
             auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                 p_output,
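Each block of ElementwiseUnaryKernel covers BlockSize * VectorSize contiguous pixels starting at block_base, and pixels_rem tells the tile loads/stores how much of that span is still valid in the last block. A minimal plain-HIP sketch of the same base-offset-plus-tail indexing, with a simple per-element bound check in place of ck_tile's pixels_rem/tensor-view path (the kernel name and layout choices here are assumptions, not the library's):

    #include <hip/hip_runtime.h>
    #include <hip/hip_fp16.h>
    #include <cstdint>

    // Each block handles BLOCK_SIZE * VECTOR_SIZE contiguous elements; each
    // thread casts VECTOR_SIZE contiguous elements. Launch with
    // grid = ceil(n / (BLOCK_SIZE * VECTOR_SIZE)) blocks of BLOCK_SIZE threads.
    template <int BLOCK_SIZE, int VECTOR_SIZE>
    __global__ void cast_fp16_to_fp32(const __half* in, float* out, uint64_t n)
    {
        uint64_t block_base  = static_cast<uint64_t>(blockIdx.x) * BLOCK_SIZE * VECTOR_SIZE;
        uint64_t thread_base = block_base + static_cast<uint64_t>(threadIdx.x) * VECTOR_SIZE;
        for(int v = 0; v < VECTOR_SIZE; v++)
        {
            uint64_t idx = thread_base + v;
            if(idx < n) // tail guard for the last block
                out[idx] = __half2float(in[idx]);
        }
    }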
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
#endif #endif
namespace ck_tile { namespace ck_tile {
template <typename Problem_, typename Policy_ = ElementwiseUnaryPolicy> template <typename Problem_, typename Policy_ = ElementwiseUnaryPolicy>
struct ElementwiseUnaryipeline struct ElementwiseUnaryipeline
{ {
...@@ -37,7 +37,8 @@ struct ElementwiseUnaryipeline ...@@ -37,7 +37,8 @@ struct ElementwiseUnaryipeline
static_for<0, Problem::Chunks, 1>{}([&](auto) { static_for<0, Problem::Chunks, 1>{}([&](auto) {
auto x = load_tile(inp_win); auto x = load_tile(inp_win);
auto y = make_static_distributed_tensor<typename Problem::OutputType>(x.get_tile_distribution()); auto y = make_static_distributed_tensor<typename Problem::OutputType>(
x.get_tile_distribution());
tile_elementwise_inout(UnaryFunctor{}, y, x); tile_elementwise_inout(UnaryFunctor{}, y, x);
store_tile(out_win, y); store_tile(out_win, y);
......
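The pipeline body runs Chunks iterations at compile time via static_for, each one a load, elementwise cast, store round trip on a tile (the window advance between chunks lies outside this hunk). Below is a hedged sketch of a minimal compile-time static_for of the kind used here; it is generic C++, not the ck_tile implementation, which is a function object invoked as static_for<...>{}(f):

    #include <cstdio>
    #include <type_traits>
    #include <utility>

    // Minimal compile-time loop: calls f(integral_constant<int, I>) for I in
    // [Begin, End) with step Inc, fully unrolled by the compiler.
    template <int Begin, int End, int Inc, typename F>
    void static_for(F&& f)
    {
        if constexpr(Begin < End)
        {
            f(std::integral_constant<int, Begin>{});
            static_for<Begin + Inc, End, Inc>(std::forward<F>(f));
        }
    }

    int main()
    {
        // Unrolled at compile time, like the Chunks loop in the pipeline.
        static_for<0, 4, 1>([](auto i) { printf("chunk %d\n", i.value); });
        return 0;
    }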
@@ -28,10 +28,10 @@ struct ElementwiseUnaryWarpPerRowProblem
     static constexpr index_t WarpSize = get_warp_size();
 
     static_assert(BytesPerIssue % sizeof(InputType) == 0);
     static constexpr index_t VectorSize = BytesPerIssue / sizeof(InputType);
     static constexpr index_t LanesPerRow = WarpSize;
     static constexpr index_t WarpsPerBlock = BlockSize / LanesPerRow;
     static constexpr index_t IssuesPerRow = 1;
 
 #if 0
     static_assert(Experts % VectorSize == 0);
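In the problem descriptor, VectorSize is derived from the byte width of one load: BytesPerIssue / sizeof(InputType), with the static_assert guaranteeing the division is exact. A tiny compile-time check of that relationship, using uint16_t as a stand-in for ck_tile::fp16_t and values matching the "4 * eb" dispatch path above:

    #include <cstdint>

    // fp16 input with a 4-element (8-byte) issue, as in the "4 * eb" path.
    using InputType             = uint16_t; // stand-in for ck_tile::fp16_t
    constexpr int BytesPerIssue = 4 * sizeof(InputType);
    static_assert(BytesPerIssue % sizeof(InputType) == 0);
    constexpr int VectorSize = BytesPerIssue / sizeof(InputType);
    static_assert(VectorSize == 4); // 8 bytes / 2 bytes per fp16 element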