"test/vscode:/vscode.git/clone" did not exist on "b81f0b459f83171cb3498aa8316a9fce751407fe"
Commit e20ed766 authored by carlushuang's avatar carlushuang
Browse files

format

parent 1e95a6e2
...@@ -188,7 +188,8 @@ bool test_cast(ck_tile::ArgParser args) ...@@ -188,7 +188,8 @@ bool test_cast(ck_tile::ArgParser args)
ck_tile::stream_config sc{stream_}; ck_tile::stream_config sc{stream_};
HIP_CHECK_ERROR(hipStreamBeginCapture(sc.stream_id_, hipStreamCaptureModeGlobal)); HIP_CHECK_ERROR(hipStreamBeginCapture(sc.stream_id_, hipStreamCaptureModeGlobal));
for(int i_r = 0; i_r < repeat; i_r++) { for(int i_r = 0; i_r < repeat; i_r++)
{
elementwise(trait, karg, sc); elementwise(trait, karg, sc);
} }
HIP_CHECK_ERROR(hipStreamEndCapture(sc.stream_id_, &graph_)); HIP_CHECK_ERROR(hipStreamEndCapture(sc.stream_id_, &graph_));
...@@ -201,8 +202,9 @@ bool test_cast(ck_tile::ArgParser args) ...@@ -201,8 +202,9 @@ bool test_cast(ck_tile::ArgParser args)
HIP_CHECK_ERROR(hipEventCreate(&start_)); HIP_CHECK_ERROR(hipEventCreate(&start_));
HIP_CHECK_ERROR(hipEventCreate(&stop_)); HIP_CHECK_ERROR(hipEventCreate(&stop_));
//warm-up // warm-up
for(int i_r = 0; i_r < warpup; i_r++) { for(int i_r = 0; i_r < warpup; i_r++)
{
elementwise(trait, karg, sc); elementwise(trait, karg, sc);
} }
HIP_CHECK_ERROR(hipDeviceSynchronize()); HIP_CHECK_ERROR(hipDeviceSynchronize());
...@@ -225,12 +227,17 @@ bool test_cast(ck_tile::ArgParser args) ...@@ -225,12 +227,17 @@ bool test_cast(ck_tile::ArgParser args)
ms = total_time / repeat; ms = total_time / repeat;
} }
#endif #endif
auto gbps = [&](){ auto gbps = [&]() {
double total_bytes = num_pixels * sizeof(SrcType) + num_pixels * sizeof(DstType); double total_bytes = num_pixels * sizeof(SrcType) + num_pixels * sizeof(DstType);
return total_bytes / 1.E6 / ms; return total_bytes / 1.E6 / ms;
}(); }();
printf( printf("[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, ",
"[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms*1e6, ms, gbps); input_prec.c_str(),
output_prec.c_str(),
num_pixels,
ms * 1e6,
ms,
gbps);
if(ms < 0) if(ms < 0)
printf("not supported\n"); printf("not supported\n");
fflush(stdout); fflush(stdout);
......
...@@ -15,8 +15,8 @@ struct Cast ...@@ -15,8 +15,8 @@ struct Cast
using src_t = s_type_; \ using src_t = s_type_; \
using dst_t = d_type_; \ using dst_t = d_type_; \
using u_fun = typename impl::Cast; \ using u_fun = typename impl::Cast; \
using problem = \ using problem = ck_tile:: \
ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \ ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \ using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \
using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \ using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \
\ \
...@@ -25,7 +25,9 @@ struct Cast ...@@ -25,7 +25,9 @@ struct Cast
constexpr dim3 blocks = kernel::BlockSize(); \ constexpr dim3 blocks = kernel::BlockSize(); \
\ \
float ave_time = ck_tile::launch_kernel( \ float ave_time = ck_tile::launch_kernel( \
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \ s, \
ck_tile::make_kernel<blocks.x, 1>( \
kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
return ave_time; return ave_time;
float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s) float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
...@@ -36,49 +38,63 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf ...@@ -36,49 +38,63 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf
if(t.output_type == "fp32" && t.input_type == "fp16") if(t.output_type == "fp32" && t.input_type == "fp16")
{ {
constexpr int eb = sizeof(ck_tile::fp16_t); constexpr int eb = sizeof(ck_tile::fp16_t);
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) { if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64))
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 64) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 64)
} }
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) { else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128))
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 128) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 128)
} }
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) { else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3))
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 256) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
} }
else if (a.num_pixels % 4 == 0) { else if(a.num_pixels % 4 == 0)
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) { {
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8))
{
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 1, 256) DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 1, 256)
} }
else { else
{
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 8, 256) DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 8, 256)
} }
} }
else { else
{
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256) DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
} }
} }
else if(t.output_type == "fp16" && t.input_type == "fp32") else if(t.output_type == "fp16" && t.input_type == "fp32")
{ {
constexpr int eb = sizeof(float); constexpr int eb = sizeof(float);
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) { if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64))
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 64) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 64)
} }
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) { else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128))
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 128) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 128)
} }
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) { else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3))
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 256)
} }
else if (a.num_pixels % 4 == 0) { else if(a.num_pixels % 4 == 0)
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) { {
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8))
{
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 1, 256) DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 1, 256)
} }
else { else
{
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 8, 256) DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 8, 256)
} }
} }
else { else
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 256)
} }
} }
} }
......
...@@ -57,17 +57,15 @@ struct ElementwiseUnaryKernel ...@@ -57,17 +57,15 @@ struct ElementwiseUnaryKernel
CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; } CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }
CK_TILE_DEVICE void operator()(const void* p_input_, CK_TILE_DEVICE void
void* p_output_, operator()(const void* p_input_, void* p_output_, uint64_t num_pixels_) const
uint64_t num_pixels_) const
{ {
uint64_t block_base = uint64_t block_base =
static_cast<uint64_t>(blockIdx.x) * Problem::BlockSize * Problem::VectorSize; static_cast<uint64_t>(blockIdx.x) * Problem::BlockSize * Problem::VectorSize;
uint64_t pixels_rem = num_pixels_ - block_base; uint64_t pixels_rem = num_pixels_ - block_base;
const auto input_window = [&]() { const auto input_window = [&]() {
const InputType* p_input = const InputType* p_input = reinterpret_cast<const InputType*>(p_input_) + block_base;
reinterpret_cast<const InputType*>(p_input_) + block_base;
auto tmp = make_naive_tensor_view_packed<address_space_enum::global>( auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
p_input, p_input,
...@@ -79,8 +77,7 @@ struct ElementwiseUnaryKernel ...@@ -79,8 +77,7 @@ struct ElementwiseUnaryKernel
}(); }();
auto output_window = [&]() { auto output_window = [&]() {
OutputType* p_output = OutputType* p_output = reinterpret_cast<OutputType*>(p_output_) + block_base;
reinterpret_cast<OutputType*>(p_output_) + block_base;
auto tmp = make_naive_tensor_view_packed<address_space_enum::global>( auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
p_output, p_output,
......
...@@ -37,7 +37,8 @@ struct ElementwiseUnaryipeline ...@@ -37,7 +37,8 @@ struct ElementwiseUnaryipeline
static_for<0, Problem::Chunks, 1>{}([&](auto) { static_for<0, Problem::Chunks, 1>{}([&](auto) {
auto x = load_tile(inp_win); auto x = load_tile(inp_win);
auto y = make_static_distributed_tensor<typename Problem::OutputType>(x.get_tile_distribution()); auto y = make_static_distributed_tensor<typename Problem::OutputType>(
x.get_tile_distribution());
tile_elementwise_inout(UnaryFunctor{}, y, x); tile_elementwise_inout(UnaryFunctor{}, y, x);
store_tile(out_win, y); store_tile(out_win, y);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment