Commit 50ba9c44 authored by carlushuang's avatar carlushuang
Browse files

modify karg

parent d7e0f7e2
......@@ -2,4 +2,4 @@
# to be included in "make all/install/check"
add_executable(tile_example_elementwise EXCLUDE_FROM_ALL elementwise.cpp elementwise_api.cpp)
target_include_directories(tile_example_elementwise PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_compile_options(tile_example_elementwise PRIVATE -v --save-temps -Wno-gnu-line-marker)
target_compile_options(tile_example_elementwise PRIVATE -v --save-temps -Wno-gnu-line-marker -mllvm --amdgpu-kernarg-preload-count=16)
......@@ -25,7 +25,7 @@ struct Cast
constexpr dim3 blocks = kernel::BlockSize(); \
\
float ave_time = ck_tile::launch_kernel( \
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs)); \
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
return ave_time;
float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_config s)
......@@ -35,11 +35,11 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf
{
if(t.output_type == "fp32" && t.input_type == "fp16")
{
DISPATCH_ELEMENTWISE_CAST(float, ck_tile::fp16_t, sizeof(ck_tile::fp16_t), 8)
DISPATCH_ELEMENTWISE_CAST(float, ck_tile::fp16_t, 8*sizeof(ck_tile::fp16_t), 8)
}
else if(t.output_type == "fp16" && t.input_type == "fp32")
{
DISPATCH_ELEMENTWISE_CAST(ck_tile::fp16_t, float, sizeof(float), 8)
DISPATCH_ELEMENTWISE_CAST(ck_tile::fp16_t, float, 4*sizeof(float), 8)
}
}
return rtn;
......
......@@ -57,15 +57,17 @@ struct ElementwiseUnaryKernel
CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::BlockSize; }
CK_TILE_DEVICE void operator()(Kargs kargs) const
CK_TILE_DEVICE void operator()(const void* p_input_,
void* p_output_,
uint64_t num_pixels_) const
{
uint64_t block_base =
static_cast<uint64_t>(blockIdx.x) * Problem::BlockSize * Problem::VectorSize;
uint64_t pixels_rem = kargs.num_pixels - block_base;
uint64_t pixels_rem = num_pixels_ - block_base;
const auto input_window = [&]() {
const InputType* p_input =
reinterpret_cast<const InputType*>(kargs.p_input) + block_base;
reinterpret_cast<const InputType*>(p_input_) + block_base;
auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
p_input,
......@@ -78,7 +80,7 @@ struct ElementwiseUnaryKernel
auto output_window = [&]() {
OutputType* p_output =
reinterpret_cast<OutputType*>(kargs.p_output) + block_base;
reinterpret_cast<OutputType*>(p_output_) + block_base;
auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
p_output,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment