Commit e20ed766 authored by carlushuang
Browse files

format

parent 1e95a6e2
......@@ -188,7 +188,8 @@ bool test_cast(ck_tile::ArgParser args)
ck_tile::stream_config sc{stream_};
HIP_CHECK_ERROR(hipStreamBeginCapture(sc.stream_id_, hipStreamCaptureModeGlobal));
for(int i_r = 0; i_r < repeat; i_r++) {
for(int i_r = 0; i_r < repeat; i_r++)
{
elementwise(trait, karg, sc);
}
HIP_CHECK_ERROR(hipStreamEndCapture(sc.stream_id_, &graph_));
......@@ -201,8 +202,9 @@ bool test_cast(ck_tile::ArgParser args)
HIP_CHECK_ERROR(hipEventCreate(&start_));
HIP_CHECK_ERROR(hipEventCreate(&stop_));
//warm-up
for(int i_r = 0; i_r < warpup; i_r++) {
// warm-up
for(int i_r = 0; i_r < warpup; i_r++)
{
elementwise(trait, karg, sc);
}
HIP_CHECK_ERROR(hipDeviceSynchronize());
......@@ -225,12 +227,17 @@ bool test_cast(ck_tile::ArgParser args)
ms = total_time / repeat;
}
#endif
auto gbps = [&](){
auto gbps = [&]() {
double total_bytes = num_pixels * sizeof(SrcType) + num_pixels * sizeof(DstType);
return total_bytes / 1.E6 / ms;
}();
printf(
"[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, ", input_prec.c_str(), output_prec.c_str(), num_pixels, ms*1e6, ms, gbps);
printf("[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, ",
input_prec.c_str(),
output_prec.c_str(),
num_pixels,
ms * 1e6,
ms,
gbps);
if(ms < 0)
printf("not supported\n");
fflush(stdout);
......
......@@ -15,8 +15,8 @@ struct Cast
using src_t = s_type_; \
using dst_t = d_type_; \
using u_fun = typename impl::Cast; \
using problem = \
ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
using problem = ck_tile:: \
ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \
using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \
\
......@@ -25,7 +25,9 @@ struct Cast
constexpr dim3 blocks = kernel::BlockSize(); \
\
float ave_time = ck_tile::launch_kernel( \
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
s, \
ck_tile::make_kernel<blocks.x, 1>( \
kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
return ave_time;
float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
......@@ -36,49 +38,63 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf
if(t.output_type == "fp32" && t.input_type == "fp16")
{
constexpr int eb = sizeof(ck_tile::fp16_t);
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 64)
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64))
{
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 64)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 128)
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128))
{
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 128)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1*eb, 1, 256)
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3))
{
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
}
else if (a.num_pixels % 4 == 0) {
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
else if(a.num_pixels % 4 == 0)
{
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8))
{
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 1, 256)
}
else {
else
{
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 4 * eb, 8, 256)
}
}
else {
else
{
DISPATCH_E_CAST_(float, ck_tile::fp16_t, 1 * eb, 1, 256)
}
}
else if(t.output_type == "fp16" && t.input_type == "fp32")
{
constexpr int eb = sizeof(float);
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 64)
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 64))
{
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 64)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 128)
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 128))
{
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 128)
}
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3)) {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
else if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 3))
{
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 256)
}
else if (a.num_pixels % 4 == 0) {
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8)) {
else if(a.num_pixels % 4 == 0)
{
if(a.num_pixels < (static_cast<uint64_t>(t.num_cu) * 256 * 4 * 8))
{
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 1, 256)
}
else {
else
{
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 4 * eb, 8, 256)
}
}
else {
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1*eb, 1, 256)
else
{
DISPATCH_E_CAST_(ck_tile::fp16_t, float, 1 * eb, 1, 256)
}
}
}
......
......@@ -57,17 +57,15 @@ struct ElementwiseUnaryKernel
CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }
CK_TILE_DEVICE void operator()(const void* p_input_,
void* p_output_,
uint64_t num_pixels_) const
CK_TILE_DEVICE void
operator()(const void* p_input_, void* p_output_, uint64_t num_pixels_) const
{
uint64_t block_base =
static_cast<uint64_t>(blockIdx.x) * Problem::BlockSize * Problem::VectorSize;
uint64_t pixels_rem = num_pixels_ - block_base;
const auto input_window = [&]() {
const InputType* p_input =
reinterpret_cast<const InputType*>(p_input_) + block_base;
const InputType* p_input = reinterpret_cast<const InputType*>(p_input_) + block_base;
auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
p_input,
......@@ -79,8 +77,7 @@ struct ElementwiseUnaryKernel
}();
auto output_window = [&]() {
OutputType* p_output =
reinterpret_cast<OutputType*>(p_output_) + block_base;
OutputType* p_output = reinterpret_cast<OutputType*>(p_output_) + block_base;
auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
p_output,
......
......@@ -37,7 +37,8 @@ struct ElementwiseUnaryipeline
static_for<0, Problem::Chunks, 1>{}([&](auto) {
auto x = load_tile(inp_win);
auto y = make_static_distributed_tensor<typename Problem::OutputType>(x.get_tile_distribution());
auto y = make_static_distributed_tensor<typename Problem::OutputType>(
x.get_tile_distribution());
tile_elementwise_inout(UnaryFunctor{}, y, x);
store_tile(out_win, y);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment