Commit e34eeb6d authored by carlushuang's avatar carlushuang
Browse files

update unary element-wise

parent 6252d207
# not using add_example_executable() to add this target, since we don't want this to have
# to be included in "make all/install/check"
add_executable(tile_example_elementwise EXCLUDE_FROM_ALL elementwise.cpp elementwise_api.cpp)
target_include_directories(tile_example_elementwise PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
# NOTE: "-v --save-temps" were compiler-debugging leftovers (verbose driver
# output and .i/.s intermediates dumped into the build tree); removed so a
# normal build stays clean. -Wno-gnu-line-marker is kept for parity with the
# other tile examples.
target_compile_options(tile_example_elementwise PRIVATE -Wno-gnu-line-marker)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include <iostream>
#include <numeric>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <time.h>
#include <unordered_set>
#include "ck_tile/core.hpp"
#include "elementwise_api.hpp"
#ifndef TEST_ELEMENTWISE_VERBOSE
#define TEST_ELEMENTWISE_VERBOSE 1
#endif
// Debug helper: pretty-print a 2-D host tensor to stdout.
// Output shape: "[0: [v0,v1,...],\n 1: [...]\n]" followed by a separator rule.
// fp16 elements are converted to float first so operator<< prints a readable
// number instead of the raw half-precision wrapper type.
//
// Fixes vs. the previous version:
//  - the non-fp16 branch used " " as separator while fp16 used "," -- the
//    separator is now uniform for every element type;
//  - the closing outer "]" was only emitted from inside the row loop, so an
//    empty tensor (len[0] == 0) never closed the bracket -- it is now printed
//    unconditionally after the loop.
template <typename T>
void dump_host_tensor_2d(const ck_tile::HostTensor<T>& x)
{
    const auto len = x.get_lengths();
    assert(len.size() == 2);
    std::cout << "[";
    for(size_t i = 0; i < len[0]; i++)
    {
        std::cout << i << ": [";
        for(size_t j = 0; j < len[1]; j++)
        {
            if constexpr(std::is_same_v<T, ck_tile::fp16_t>)
            {
                // print as float; fp16 has no stream operator of its own
                std::cout << ck_tile::type_convert<float>(x(std::vector<std::size_t>{i, j}));
            }
            else
            {
                std::cout << x(std::vector<std::size_t>{i, j});
            }
            if(j != len[1] - 1)
                std::cout << ",";
        }
        std::cout << "]";
        if(i != len[0] - 1)
            std::cout << ",";
        std::cout << std::endl;
    }
    std::cout << "]" << std::endl; // close outer bracket even for empty tensors
    std::cout << "--------------------" << std::endl;
}
// Generic cast functor used by the CPU reference path: writes the source
// value into the destination reference after converting it to DstType.
struct Cast
{
    template <typename DstType, typename SrcType>
    CK_TILE_HOST_DEVICE void operator()(DstType& y, const SrcType& x) const
    {
        const DstType converted = ck_tile::type_convert<DstType>(x);
        y = converted;
    }
};
// CPU reference: apply UnaryF element-wise over x, producing a same-shaped
// DstType tensor. UnaryF must expose operator()(DstType&, const SrcType&).
template <typename DstType, typename SrcType, typename UnaryF>
auto reference_elementwise_unary(const ck_tile::HostTensor<SrcType>& x)
{
    ck_tile::HostTensor<DstType> result(x.get_lengths());
    const UnaryF functor{};
    result.ForEach([&](auto& self, auto idx) { functor(self(idx), x(idx)); });
    return result;
}
// Comparison thresholds for validation, per data type.
// Default: (rtol, atol) = (1e-3, 1e-3) for every type without an explicit
// specialization below.
template <typename DataType>
auto get_elimit(std::string /*init_method*/)
{
    constexpr double tolerance = 1e-3;
    return ck_tile::make_tuple(tolerance, tolerance);
}
// bf16 carries fewer mantissa bits, so both tolerances are widened to 1e-2.
template <>
auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
{
    constexpr double tolerance = 1e-2;
    return ck_tile::make_tuple(tolerance, tolerance);
}
// fp8 is too coarse for a relative tolerance; instead return
// (max_rounding_point_distance, atol). Integer-style init ("ui"/"ni")
// produces exactly representable values, so the bound is tight; otherwise a
// one-ULP rounding distance and a wide absolute tolerance are allowed.
template <>
auto get_elimit<ck_tile::fp8_t>(std::string init_method)
{
    const bool integer_init = (init_method == "ui" || init_method == "ni");
    if(integer_init)
    {
        return ck_tile::make_tuple(0u, 2e-3);
    }
    return ck_tile::make_tuple(1u, 0.0625);
}
// Build the command-line parser for this example.
// Returns (parse_ok, parser); parse_ok is false when the arguments were
// rejected. Help-string typos fixed: "weather" -> "whether", "t to 1" ->
// "set to 1". Option keys and defaults are unchanged.
auto create_args(int argc, char* argv[])
{
    ck_tile::ArgParser arg_parser;
    arg_parser.insert("v", "1", "whether to do CPU validation or not")
        .insert("op", "cast", "which elementwise operator to run")
        .insert("pr_i", "fp16", "input precision")
        .insert("pr_o", "fp32", "output precision")
        .insert("n", "1000", "number of pixels to cast")
        .insert("seed", "-1", "seed to be used, -1 means random every time")
        .insert("kname", "0", "set to 1 will print kernel name");
    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
}
// Run the device "cast" kernel for SrcType -> DstType over `n` pixels and,
// when -v=1, validate against the CPU reference.
// Returns true when the kernel ran (trait combination supported) and, if
// enabled, validation passed.
template <typename DstType, typename SrcType>
bool test_cast(ck_tile::ArgParser args)
{
    const int validate            = args.get_int("v");
    const std::string input_prec  = args.get_str("pr_i");
    const std::string output_prec = args.get_str("pr_o");
    const uint64_t num_pixels     = args.get_uint64("n");
    int seed                      = args.get_int("seed");
    if(seed < 0)
    {
        // negative seed requests a fresh random sequence every run
        seed = static_cast<int>(std::time(nullptr));
    }

    // flat 1-D buffers; tokens already considered batch size
    ck_tile::HostTensor<SrcType> x_host({num_pixels});
    ck_tile::HostTensor<DstType> y_host({num_pixels});
    ck_tile::FillUniformDistribution<SrcType>{-5, 5, seed}(x_host);

    ck_tile::DeviceMem x_dev(x_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem y_dev(y_host.get_element_space_size_in_bytes());
    x_dev.ToDevice(x_host.data());

    elementwise_trait trait;
    trait.input_type  = input_prec;
    trait.output_type = output_prec;
    trait.op          = std::string("cast");

    elementwise_kargs karg;
    karg.p_input    = x_dev.GetDeviceBuffer();
    karg.p_output   = y_dev.GetDeviceBuffer();
    karg.num_pixels = num_pixels;

#if TEST_ELEMENTWISE_VERBOSE
    ck_tile::stream_config sc{nullptr, true};
    auto ms = elementwise(trait, karg, sc);
    // cast + %llu keeps the format portable across LP64/LLP64 targets
    // (uint64_t is not `unsigned long` everywhere)
    printf("[cast] %s->%s, n:%llu, ms:%f, ",
           input_prec.c_str(),
           output_prec.c_str(),
           static_cast<unsigned long long>(num_pixels),
           ms);
    if(ms < 0)
        printf("not supported\n");
    fflush(stdout);
#else
    ck_tile::stream_config sc{nullptr};
    // BUGFIX: this path previously called elementwise_unary(), which is not
    // declared anywhere; the API entry point is elementwise().
    auto ms = elementwise(trait, karg, sc);
#endif
    if(ms < 0)
    {
        // dispatcher does not support this type combination
        return false;
    }

    y_dev.FromDevice(y_host.data());

    bool rtn = true;
    if(validate)
    {
        // this host buffer will not copy to GPU, so no need use stride
        auto y_ref = reference_elementwise_unary<DstType, SrcType, Cast>(x_host);
        // NOTE(review): tolerance is selected from the *input* type; when the
        // output is the lower-precision side the bound may be too tight --
        // confirm intended behavior.
        auto [rtol, atol] = get_elimit<SrcType>("");
        rtn &= ck_tile::check_err(
            y_host, y_ref, std::string("Value Error: Incorrect results!"), rtol, atol);
        printf("valid:%s", rtn ? "y" : "n");
        fflush(stdout);
    }
#if TEST_ELEMENTWISE_VERBOSE
    printf("\n");
    fflush(stdout);
#endif
    return rtn;
}
// Entry point: parse arguments, then dispatch to the matching test
// instantiation. Currently only op=cast with fp16<->fp32 is wired up.
// Exit code: 0 on success, -1 on parse failure or a failed test.
int main(int argc, char** argv)
{
    auto [result, args] = create_args(argc, argv);
    if(!result)
        return -1;

    const std::string input_prec  = args.get_str("pr_i");
    const std::string output_prec = args.get_str("pr_o");
    const std::string op          = args.get_str("op");

    bool ok = true;
    if(op == "cast")
    {
        if(input_prec == "fp16" && output_prec == "fp32")
        {
            ok &= test_cast<float, ck_tile::fp16_t>(args);
        }
        else if(input_prec == "fp32" && output_prec == "fp16")
        {
            ok &= test_cast<ck_tile::fp16_t, float>(args);
        }
    }
    return ok ? 0 : -1;
}
#include "elementwise_api.hpp"
namespace impl {
// Unary cast functor plugged into the elementwise problem as UnaryFunctor:
// converts the source value to DstType and stores it through the destination
// reference.
struct Cast
{
    template <typename DstType, typename SrcType>
    CK_TILE_HOST_DEVICE void operator()(DstType& y, const SrcType& x) const
    {
        const DstType converted = ck_tile::type_convert<DstType>(x);
        y = converted;
    }
};
} // namespace impl
// Instantiate problem/pipeline/kernel for one (dst, src) cast combination and
// launch it. The macro expands to a series of declarations followed by a
// `return` of the measured time, so it must be the LAST statement of the
// branch it appears in (see elementwise() below).
//   d_type_         : destination element type
//   s_type_         : source element type
//   byte_per_issue_ : bytes per memory issue (VectorSize = this / sizeof(src))
//   chunks_         : tiles processed per block by the pipeline
// NOTE(review): "ElementwiseUnaryipeline" is the real (misspelled) class name
// in the ops header -- do not "fix" it here without renaming the class.
#define DISPATCH_ELEMENTWISE_CAST(d_type_, s_type_, byte_per_issue_, chunks_)                  \
    using src_t = s_type_;                                                                     \
    using dst_t = d_type_;                                                                     \
    using u_fun = typename impl::Cast;                                                         \
    using problem =                                                                            \
        ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_>; \
    using pipeline = ck_tile::ElementwiseUnaryipeline<problem>;                                \
    using kernel   = ck_tile::ElementwiseUnaryKernel<pipeline>;                                \
                                                                                               \
    auto kargs           = kernel::MakeKargs(a);                                               \
    const dim3 grids     = kernel::GridSize(a);                                                \
    constexpr dim3 blocks = kernel::BlockSize();                                               \
                                                                                               \
    float ave_time = ck_tile::launch_kernel(                                                   \
        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));              \
    return ave_time;
// Runtime dispatcher: map the (op, input_type, output_type) trait triple to a
// concrete kernel instantiation and launch it. The DISPATCH macro expands to
// a `return` of the measured time; falling through every branch yields the
// "unsupported" sentinel (-1).
float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
{
    constexpr float not_supported = -1.f;
    if(t.op != "cast")
        return not_supported;

    if(t.input_type == "fp16" && t.output_type == "fp32")
    {
        DISPATCH_ELEMENTWISE_CAST(float, ck_tile::fp16_t, sizeof(ck_tile::fp16_t), 8)
    }
    if(t.input_type == "fp32" && t.output_type == "fp16")
    {
        DISPATCH_ELEMENTWISE_CAST(ck_tile::fp16_t, float, sizeof(float), 8)
    }
    return not_supported;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise_unary.hpp"
#include <string>
// Runtime dispatch descriptor: selects which kernel instantiation
// elementwise() launches.
struct elementwise_trait
{
    std::string input_type;  // input type, e.g. "fp16"/"fp32"
    std::string acc_type;    // type to do intermediate computation
                             // NOTE(review): never set or read by the current
                             // cast path -- confirm before relying on it
    std::string output_type; // type to store out
    std::string op;          // operator name; only "cast" is dispatched today
};
// Host-side launch arguments; inherits p_input/p_output/num_pixels unchanged
// from the kernel's host-args struct.
struct elementwise_kargs : public ck_tile::ElementwiseUnaryHostArgs
{
};
float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_pipeline.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_policy.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_problem.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
// Host-side arguments for the unary elementwise kernel: two flat global
// buffers and the element count. Mirrored 1:1 into the device-side Kargs by
// ElementwiseUnaryKernel::MakeKargs.
struct ElementwiseUnaryHostArgs
{
    const void* p_input;  // device pointer, num_pixels elements of InputType
    void* p_output;       // device pointer, num_pixels elements of OutputType
    uint64_t num_pixels;  // total element count (1-D problem)
};
// Kernel wrapper around Pipeline_: computes launch geometry, marshals host
// args into a POD karg struct, and builds the global-memory tile windows the
// pipeline consumes.
template <typename Pipeline_>
struct ElementwiseUnaryKernel
{
    using Pipeline   = remove_cvref_t<Pipeline_>;
    using Problem    = remove_cvref_t<typename Pipeline::Problem>;
    using InputType  = typename Problem::InputType;
    using OutputType = typename Problem::OutputType;

    // Device-side copy of the host arguments (flat 1-D problem).
    struct ElementwiseUnaryKargs
    {
        const void* p_input;
        void* p_output;
        uint64_t num_pixels;
    };
    using Kargs = ElementwiseUnaryKargs;
    using Hargs = ElementwiseUnaryHostArgs;

    // A block covers BlockSize*VectorSize pixels per pipeline iteration and
    // the pipeline runs Chunks iterations (see the static_for in the
    // pipeline), so the grid is ceil(num_pixels / (BlockSize*Chunks*VectorSize)).
    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
    {
        constexpr index_t issues_per_block =
            Problem::BlockSize * Problem::Chunks * Problem::VectorSize;
        index_t grids =
            static_cast<index_t>((h.num_pixels + issues_per_block - 1) / issues_per_block);
        return dim3(grids);
    }

    // Field-by-field copy of the host args into the kernel-arg POD.
    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
    {
        Kargs k;
        k.p_input    = h.p_input;
        k.p_output   = h.p_output;
        k.num_pixels = h.num_pixels;
        return k;
    }

    CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }

    CK_TILE_DEVICE void operator()(Kargs kargs) const
    {
        // First pixel owned by this block on the first pipeline iteration.
        uint64_t block_base =
            static_cast<uint64_t>(blockIdx.x) * Problem::BlockSize * Problem::VectorSize;
        // Elements remaining from this block's start to the end of the
        // buffer; used as the tensor-view length so the trailing partial tile
        // is bounded. (GridSize guarantees block_base < num_pixels, so this
        // unsigned subtraction cannot underflow.)
        uint64_t pixels_rem = kargs.num_pixels - block_base;
        const auto input_window = [&]() {
            const InputType* p_input =
                reinterpret_cast<const InputType*>(kargs.p_input) + block_base;
            auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                p_input,
                make_tuple(static_cast<index_t>(pixels_rem)),
                number<Problem::VectorSize>{});
            return make_tile_window(
                tmp, make_tuple(number<Problem::BlockSize * Problem::VectorSize>{}), {0});
        }();
        auto output_window = [&]() {
            OutputType* p_output =
                reinterpret_cast<OutputType*>(kargs.p_output) + block_base;
            auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                p_output,
                make_tuple(static_cast<index_t>(pixels_rem)),
                number<Problem::VectorSize>{});
            return make_tile_window(
                tmp, make_tuple(number<Problem::BlockSize * Problem::VectorSize>{}), {0});
        }();
        // Per-iteration advance: the whole grid steps forward together.
        // readfirstlane keeps the (uniform) stride in a scalar register.
        index_t loop_stride =
            __builtin_amdgcn_readfirstlane(gridDim.x * Problem::BlockSize * Problem::VectorSize);
        Pipeline{}(input_window, output_window, loop_stride);
    }
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_policy.hpp"
#include <string>
#include <type_traits>
#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 1
#endif
namespace ck_tile {
// 1-D unary elementwise pipeline: Problem::Chunks times, load a tile, apply
// Problem::UnaryFunctor element-wise, store the result, then slide both
// windows forward by loop_stride elements.
// NOTE(review): the struct name is missing a 'P' ("...Unaryipeline").
// External code (the dispatch macro in the example) instantiates this exact
// spelling, so renaming would be an interface break; documented instead.
template <typename Problem_, typename Policy_ = ElementwiseUnaryPolicy>
struct ElementwiseUnaryipeline
{
    // TODO: this kernel only support warp per row
    using Problem      = remove_cvref_t<Problem_>;
    using Policy       = remove_cvref_t<Policy_>;
    using UnaryFunctor = typename Problem::UnaryFunctor;

    // inp_window/out_window: global-memory tile windows built by the kernel.
    // loop_stride: element distance between consecutive chunk iterations
    // (the whole grid advances together).
    template <typename InputWindow, typename OutputWindow>
    CK_TILE_DEVICE auto
    operator()(const InputWindow& inp_window, OutputWindow& out_window, index_t loop_stride)
    {
        // Rebuild both windows with explicit static tile distributions so
        // load_tile/store_tile know the per-thread element layout.
        auto inp_win = make_tile_window(inp_window.get_bottom_tensor_view(),
                                        inp_window.get_window_lengths(),
                                        inp_window.get_window_origin(),
                                        Policy::template MakeInputDistribution<Problem>());
        auto out_win = make_tile_window(out_window.get_bottom_tensor_view(),
                                        out_window.get_window_lengths(),
                                        out_window.get_window_origin(),
                                        Policy::template MakeOutputDistribution<Problem>());
        static_for<0, Problem::Chunks, 1>{}([&](auto) {
            auto x = load_tile(inp_win);
            // y inherits x's distribution, so the functor application is a
            // straight per-element register transform.
            auto y = make_static_distributed_tensor<typename Problem::OutputType>(
                x.get_tile_distribution());
            tile_elementwise_inout(UnaryFunctor{}, y, x);
            store_tile(out_win, y);
            move_tile_window(inp_win, {loop_stride});
            move_tile_window(out_win, {loop_stride});
        });
    }
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
// Tile-distribution policy for the unary elementwise pipeline: describes how
// the 1-D pixel range is factored into
// IssuesPerRow x WarpsPerBlock x LanesPerRow x VectorSize (values supplied by
// ElementwiseUnaryWarpPerRowProblem).
// NOTE(review): the sequence/tuple arguments follow ck_tile's
// tile_distribution_encoding convention -- consult the ck_tile core docs
// before editing; the claims here are limited to what the Problem constants
// show.
struct ElementwiseUnaryPolicy
{
    // Distribution applied to the input tile window before load_tile.
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto MakeInputDistribution()
    {
        // TODO: Y dim must have one dim that is not reduced
        return make_static_tile_distribution(
            tile_distribution_encoding<sequence<1>,
                                       tuple<sequence<Problem::IssuesPerRow,
                                                      Problem::WarpsPerBlock,
                                                      Problem::LanesPerRow,
                                                      Problem::VectorSize>>,
                                       tuple<sequence<1>, sequence<1>>,
                                       tuple<sequence<1>, sequence<2>>,
                                       sequence<1, 1>,
                                       sequence<0, 3>>{});
    }

    // Distribution applied to the output tile window before store_tile;
    // identical to the input distribution so loads and stores stay
    // element-aligned for the in-register transform.
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto MakeOutputDistribution()
    {
        return make_static_tile_distribution(
            tile_distribution_encoding<sequence<1>,
                                       tuple<sequence<Problem::IssuesPerRow,
                                                      Problem::WarpsPerBlock,
                                                      Problem::LanesPerRow,
                                                      Problem::VectorSize>>,
                                       tuple<sequence<1>, sequence<1>>,
                                       tuple<sequence<1>, sequence<2>>,
                                       sequence<1, 1>,
                                       sequence<0, 3>>{});
    }
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
// Compile-time problem description for the unary elementwise kernel.
//   InputType_/OutputType_ : element types before/after the functor
//   UnaryFunctor_          : functor exposing operator()(Dst&, const Src&)
//   BytesPerIssue_         : bytes moved per memory issue; must be a multiple
//                            of sizeof(InputType) (defaults to one element)
//   Chunks_                : pipeline iterations per block
//   BlockSize_             : threads per workgroup
template <typename InputType_,
          typename OutputType_,
          typename UnaryFunctor_,
          index_t BytesPerIssue_ = sizeof(InputType_), // this is input
          index_t Chunks_        = 8,
          index_t BlockSize_     = 256>
struct ElementwiseUnaryWarpPerRowProblem
{
    // TODO: this kernel only support warp per row
    using InputType    = remove_cvref_t<InputType_>;
    using OutputType   = remove_cvref_t<OutputType_>;
    using UnaryFunctor = remove_cvref_t<UnaryFunctor_>;

    static constexpr index_t Chunks        = Chunks_;
    static constexpr index_t BytesPerIssue = BytesPerIssue_;
    static constexpr index_t BlockSize     = BlockSize_;
    static constexpr index_t WarpSize      = get_warp_size();

    static_assert(BytesPerIssue % sizeof(InputType) == 0);
    // elements per memory issue, derived from the byte width
    static constexpr index_t VectorSize    = BytesPerIssue / sizeof(InputType);
    // flat 1-D problem: every lane of a warp sits on the same "row"
    static constexpr index_t LanesPerRow   = WarpSize;
    static constexpr index_t WarpsPerBlock = BlockSize / LanesPerRow;
    static constexpr index_t IssuesPerRow  = 1;
#if 0
    // NOTE(review): dead code kept from a 2-D (per-expert) variant; it
    // references an `Experts` constant that is not defined anywhere in this
    // header and would not compile if enabled.
    static_assert(Experts % VectorSize == 0);
    static constexpr index_t LanesPerRow = min(Experts / VectorSize, WarpSize);
    static_assert(WarpSize % LanesPerRow == 0);
    static constexpr index_t RowsPerWarpPerColIssue = WarpSize / LanesPerRow;
    static constexpr index_t RowsPerWarp = IssuesPerCol * RowsPerWarpPerColIssue;
    static constexpr index_t IssuesPerRow = Experts / (LanesPerRow * VectorSize);
    static constexpr index_t WarpsPerBlock = BlockSize / WarpSize;
    static constexpr index_t RowsPerBlock = RowsPerWarp * WarpsPerBlock;
#endif
};
} // namespace ck_tile
import pathlib
from pathlib import Path
import subprocess
import os
import copy
NS = 'ck_tile'
OPS = 'ops'
OPS_COMMON = 'common' # common header will be duplicated into ops/* other module
HEADER_COMMON = """// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
"""
# aa/bb/cc/file.hpp -> pick one path component: level 0 -> 'aa', level 1 -> 'bb', ...
def get_module(f, level = 0):
    """Return the path component of *f* at index *level*, as a string."""
    component = f.parts[level]
    return str(component)
# Recursively collect every .hpp header under the current working directory.
# sorted() makes traversal order -- and therefore the generated include order
# -- deterministic across runs and platforms.
all_files = []
for p in sorted(Path("./").rglob("*")):
    if p.suffix == '.hpp':
        all_files.append(pathlib.PurePath(p))
class submodule_t:
    """Groups header paths by top-level module and generates one umbrella
    header per module (or per ops/* sub-module), each containing #include
    lines for every collected header."""

    def __init__(self):
        # m: module name -> list of header paths, EXCEPT for OPS which maps
        # to a nested dict: {'ops': {sub_module: [header paths...]}}.
        self.m = dict()

    def push(self, f):
        # Register one header path in the module tree. Headers directly under
        # the repo root have a single parent ('.') and are skipped.
        if len(f.parents) != 1: # ignore ./xxx.hpp
            mod = get_module(f)
            if mod == OPS:
                if mod not in self.m.keys():
                    self.m[mod] = dict()
                mod2 = get_module(f, 1)
                # only ops/<subdir>/... entries count; a bare ops/xxx.hpp has
                # a '.hpp' suffix at level 1 and is skipped
                if Path(mod2).suffix != '.hpp':
                    # ignore ops/xxx.hpp
                    if mod2 not in self.m[mod].keys():
                        self.m[mod][mod2] = list()
                    self.m[mod][mod2].append(f)
            else:
                if mod not in self.m.keys():
                    self.m[mod] = list()
                self.m[mod].append(f)

    def gen(self):
        def gen_header(hpath, include_list):
            # Rewrite hpath from scratch: license banner, pragma once, then
            # one #include per collected header (prefixed with the NS dir).
            # print(hpath)
            if os.path.exists(str(hpath)):
                os.remove(str(hpath))
            with hpath.open('w') as f:
                f.write(HEADER_COMMON)
                f.write('#pragma once\n')
                f.write('\n')
                for individual_header in include_list:
                    header_path = NS + '/' + str(individual_header)
                    f.write(f'#include \"{header_path}\"\n')
                # f.write('\n') # otherwise clang-format will complain
        # print(self.m)
        # restructure common: duplicate the ops/common headers into every
        # other ops/* sub-module so each umbrella header is self-contained
        for k, v in self.m.items():
            if k == OPS and OPS_COMMON in v.keys():
                common_list = copy.deepcopy(v[OPS_COMMON])
                # v.pop(OPS_COMMON)
                for km in v.keys():
                    if km != OPS_COMMON:
                        v[km].extend(common_list)
        # emit ops/<submodule>.hpp for each ops sub-module, <module>.hpp for
        # everything else
        for k, v in self.m.items():
            if k == OPS:
                for km, kv in v.items():
                    gen_header(Path(k) / (f'{km}.hpp'), kv)
            else:
                gen_header(Path(f'{k}.hpp'), v)
submodule = submodule_t()
# formatting: normalize line endings, then clang-format, then register the
# file with the submodule tree.
# BUGFIX: this previously used subprocess.Popen without waiting, so
# clang-format could race dos2unix on the same file and the script could exit
# before any formatter finished; subprocess.run blocks until each tool
# completes.
for x in all_files:
    subprocess.run(f'dos2unix {str(x)}', shell=True)
    subprocess.run(f'clang-format-12 -style=file -i {str(x)}', shell=True)
    submodule.push(x)
submodule.gen()
#print(all_files)
...@@ -6,3 +6,4 @@ add_subdirectory(01_fmha)
 add_subdirectory(02_layernorm2d)
 add_subdirectory(03_gemm)
 add_subdirectory(06_permute)
+add_subdirectory(19_elementwise)
...@@ -1079,6 +1079,15 @@ struct ConvScaleRelu
 float scale_out_;
 };
template<typename DstType, typename SrcType>
struct Cast {
template <typename T>
CK_TILE_HOST_DEVICE void operator()(DstType& y, const SrcType& x) const
{
y = ck_tile::type_convert<DstType>(x);
};
};
// support fastconvert of int8 to fp16
#if 0
template <typename InputDataType, typename OutputDataType, index_t RegPackNumber>
......
...@@ -9,7 +9,7 @@
 #include <type_traits>
 #ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
-#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 0
+#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 1
 #endif
 namespace ck_tile {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment