Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
e34eeb6d
Commit
e34eeb6d
authored
Sep 13, 2024
by
carlushuang
Browse files
update unary element-wise
parent
6252d207
Changes
13
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
647 additions
and
1 deletion
+647
-1
example/ck_tile/19_elementwise/CMakeLists.txt
example/ck_tile/19_elementwise/CMakeLists.txt
+5
-0
example/ck_tile/19_elementwise/elementwise.cpp
example/ck_tile/19_elementwise/elementwise.cpp
+227
-0
example/ck_tile/19_elementwise/elementwise_api.cpp
example/ck_tile/19_elementwise/elementwise_api.cpp
+46
-0
example/ck_tile/19_elementwise/elementwise_api.hpp
example/ck_tile/19_elementwise/elementwise_api.hpp
+22
-0
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary.hpp
.../19_elementwise/include/ck_tile/ops/elementwise_unary.hpp
+9
-0
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp
...ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp
+98
-0
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_pipeline.hpp
...elementwise_unary/pipeline/elementwise_unary_pipeline.hpp
+50
-0
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_policy.hpp
...s/elementwise_unary/pipeline/elementwise_unary_policy.hpp
+43
-0
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_problem.hpp
.../elementwise_unary/pipeline/elementwise_unary_problem.hpp
+48
-0
example/ck_tile/19_elementwise/include/ck_tile/remod.py
example/ck_tile/19_elementwise/include/ck_tile/remod.py
+88
-0
example/ck_tile/CMakeLists.txt
example/ck_tile/CMakeLists.txt
+1
-0
include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
.../ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+9
-0
include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp
...k_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp
+1
-1
No files found.
example/ck_tile/19_elementwise/CMakeLists.txt
0 → 100644
View file @
e34eeb6d
# not using add_example_executable() to add this target, since we don't want this to have
# to be included in "make all/install/check"
add_executable
(
tile_example_elementwise EXCLUDE_FROM_ALL elementwise.cpp elementwise_api.cpp
)
target_include_directories
(
tile_example_elementwise PRIVATE
${
CMAKE_CURRENT_SOURCE_DIR
}
/include
)
target_compile_options
(
tile_example_elementwise PRIVATE -v --save-temps -Wno-gnu-line-marker
)
example/ck_tile/19_elementwise/elementwise.cpp
0 → 100644
View file @
e34eeb6d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include <iostream>
#include <numeric>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <time.h>
#include <unordered_set>
#include "ck_tile/core.hpp"
#include "elementwise_api.hpp"
#ifndef TEST_ELEMENTWISE_VERBOSE
#define TEST_ELEMENTWISE_VERBOSE 1
#endif
template
<
typename
T
>
void
dump_host_tensor_2d
(
const
ck_tile
::
HostTensor
<
T
>&
x
)
{
auto
len
=
x
.
get_lengths
();
assert
(
len
.
size
()
==
2
);
std
::
cout
<<
"["
;
for
(
size_t
i
=
0
;
i
<
len
[
0
];
i
++
)
{
std
::
cout
<<
i
<<
": ["
;
for
(
size_t
j
=
0
;
j
<
len
[
1
];
j
++
)
{
if
constexpr
(
std
::
is_same_v
<
T
,
ck_tile
::
fp16_t
>
)
{
auto
v
=
ck_tile
::
type_convert
<
float
>
(
x
(
std
::
vector
<
std
::
size_t
>
{
i
,
j
}));
std
::
cout
<<
v
;
if
(
j
!=
len
[
1
]
-
1
)
std
::
cout
<<
","
;
}
else
{
std
::
cout
<<
x
(
std
::
vector
<
std
::
size_t
>
{
i
,
j
})
<<
" "
;
}
}
std
::
cout
<<
"]"
;
if
(
i
!=
len
[
0
]
-
1
)
std
::
cout
<<
","
;
else
std
::
cout
<<
"]"
;
std
::
cout
<<
std
::
endl
;
}
std
::
cout
<<
"--------------------"
<<
std
::
endl
;
}
struct
Cast
{
template
<
typename
DstType
,
typename
SrcType
>
CK_TILE_HOST_DEVICE
void
operator
()(
DstType
&
y
,
const
SrcType
&
x
)
const
{
y
=
ck_tile
::
type_convert
<
DstType
>
(
x
);
};
};
// CPU reference
template
<
typename
DstType
,
typename
SrcType
,
typename
UnaryF
>
auto
reference_elementwise_unary
(
const
ck_tile
::
HostTensor
<
SrcType
>&
x
)
{
using
namespace
ck_tile
;
auto
y
=
ck_tile
::
HostTensor
<
DstType
>
(
x
.
get_lengths
());
y
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
UnaryF
{}(
self
(
idx
),
x
(
idx
));
});
return
y
;
}
// different threshold for different dtype
template
<
typename
DataType
>
auto
get_elimit
(
std
::
string
/*init_method*/
)
{
double
rtol
=
1e-3
;
double
atol
=
1e-3
;
return
ck_tile
::
make_tuple
(
rtol
,
atol
);
}
template
<
>
auto
get_elimit
<
ck_tile
::
bf16_t
>
(
std
::
string
/*init_method*/
)
{
double
rtol
=
1e-2
;
double
atol
=
1e-2
;
return
ck_tile
::
make_tuple
(
rtol
,
atol
);
}
template
<
>
auto
get_elimit
<
ck_tile
::
fp8_t
>
(
std
::
string
init_method
)
{
if
(
init_method
==
"ui"
||
init_method
==
"ni"
)
{
unsigned
max_rounding_point_distance
=
0
;
double
atol
=
2e-3
;
return
ck_tile
::
make_tuple
(
max_rounding_point_distance
,
atol
);
}
else
{
unsigned
max_rounding_point_distance
=
1
;
double
atol
=
0.0625
;
return
ck_tile
::
make_tuple
(
max_rounding_point_distance
,
atol
);
}
}
auto
create_args
(
int
argc
,
char
*
argv
[])
{
ck_tile
::
ArgParser
arg_parser
;
arg_parser
.
insert
(
"v"
,
"1"
,
"weather do CPU validation or not"
)
.
insert
(
"op"
,
"cast"
,
"which elementwise operator to run"
)
.
insert
(
"pr_i"
,
"fp16"
,
"input precision"
)
.
insert
(
"pr_o"
,
"fp32"
,
"output precision"
)
.
insert
(
"n"
,
"1000"
,
"number of pixels to cast"
)
.
insert
(
"seed"
,
"-1"
,
"seed to be used, -1 means random every time"
)
.
insert
(
"kname"
,
"0"
,
"t to 1 will print kernel name"
);
bool
result
=
arg_parser
.
parse
(
argc
,
argv
);
return
std
::
make_tuple
(
result
,
arg_parser
);
}
template
<
typename
DstType
,
typename
SrcType
>
bool
test_cast
(
ck_tile
::
ArgParser
args
)
{
int
validate
=
args
.
get_int
(
"v"
);
std
::
string
input_prec
=
args
.
get_str
(
"pr_i"
);
std
::
string
output_prec
=
args
.
get_str
(
"pr_o"
);
uint64_t
num_pixels
=
args
.
get_uint64
(
"n"
);
int
seed
=
args
.
get_int
(
"seed"
);
if
(
seed
<
0
)
{
seed
=
std
::
time
(
nullptr
);
}
// tokens already considered batch size
ck_tile
::
HostTensor
<
SrcType
>
x_host
({
num_pixels
});
ck_tile
::
HostTensor
<
DstType
>
y_host
({
num_pixels
});
ck_tile
::
FillUniformDistribution
<
SrcType
>
{
-
5
,
5
,
seed
}(
x_host
);
ck_tile
::
DeviceMem
x_dev
(
x_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
y_dev
(
y_host
.
get_element_space_size_in_bytes
());
x_dev
.
ToDevice
(
x_host
.
data
());
elementwise_trait
trait
=
[
&
]()
{
elementwise_trait
t_
;
t_
.
input_type
=
input_prec
;
t_
.
output_type
=
output_prec
;
t_
.
op
=
std
::
string
(
"cast"
);
return
t_
;
}();
elementwise_kargs
karg
=
[
&
]()
{
elementwise_kargs
a_
;
a_
.
p_input
=
x_dev
.
GetDeviceBuffer
();
a_
.
p_output
=
y_dev
.
GetDeviceBuffer
();
a_
.
num_pixels
=
num_pixels
;
return
a_
;
}();
#if TEST_ELEMENTWISE_VERBOSE
ck_tile
::
stream_config
sc
{
nullptr
,
true
};
// ck_tile::stream_config sc{nullptr};
auto
ms
=
elementwise
(
trait
,
karg
,
sc
);
printf
(
"[cast] %s->%s, n:%lu, ms:%f, "
,
input_prec
.
c_str
(),
output_prec
.
c_str
(),
num_pixels
,
ms
);
if
(
ms
<
0
)
printf
(
"not supported
\n
"
);
fflush
(
stdout
);
#else
ck_tile
::
stream_config
sc
{
nullptr
};
auto
ms
=
elementwise_unary
(
trait
,
karg
,
sc
);
#endif
if
(
ms
<
0
)
{
return
false
;
}
y_dev
.
FromDevice
(
y_host
.
data
());
bool
rtn
=
true
;
if
(
validate
)
{
// this host buffer will not copy to GPU, so no need use stride
auto
y_ref
=
reference_elementwise_unary
<
DstType
,
SrcType
,
Cast
>
(
x_host
);
auto
[
rtol
,
atol
]
=
get_elimit
<
SrcType
>
(
""
);
rtn
&=
ck_tile
::
check_err
(
y_host
,
y_ref
,
std
::
string
(
"Value Error: Incorrect results!"
),
rtol
,
atol
);
printf
(
"valid:%s"
,
rtn
?
"y"
:
"n"
);
fflush
(
stdout
);
}
#if TEST_ELEMENTWISE_VERBOSE
printf
(
"
\n
"
);
fflush
(
stdout
);
#endif
return
rtn
;
}
int
main
(
int
argc
,
char
**
argv
)
{
auto
[
result
,
args
]
=
create_args
(
argc
,
argv
);
if
(
!
result
)
return
-
1
;
std
::
string
input_prec
=
args
.
get_str
(
"pr_i"
);
std
::
string
output_prec
=
args
.
get_str
(
"pr_o"
);
std
::
string
op
=
args
.
get_str
(
"op"
);
bool
r
=
true
;
if
(
op
.
compare
(
"cast"
)
==
0
)
{
if
(
input_prec
.
compare
(
"fp16"
)
==
0
&&
output_prec
.
compare
(
"fp32"
)
==
0
)
{
r
&=
test_cast
<
float
,
ck_tile
::
fp16_t
>
(
args
);
}
else
if
(
input_prec
.
compare
(
"fp32"
)
==
0
&&
output_prec
.
compare
(
"fp16"
)
==
0
)
{
r
&=
test_cast
<
ck_tile
::
fp16_t
,
float
>
(
args
);
}
}
return
r
?
0
:
-
1
;
}
example/ck_tile/19_elementwise/elementwise_api.cpp
0 → 100644
View file @
e34eeb6d
#include "elementwise_api.hpp"
namespace
impl
{
struct
Cast
{
template
<
typename
DstType
,
typename
SrcType
>
CK_TILE_HOST_DEVICE
void
operator
()(
DstType
&
y
,
const
SrcType
&
x
)
const
{
y
=
ck_tile
::
type_convert
<
DstType
>
(
x
);
};
};
}
// namespace impl
#define DISPATCH_ELEMENTWISE_CAST(d_type_, s_type_, byte_per_issue_, chunks_) \
using src_t = s_type_; \
using dst_t = d_type_; \
using u_fun = typename impl::Cast; \
using problem = \
ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_>; \
using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \
using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \
\
auto kargs = kernel::MakeKargs(a); \
const dim3 grids = kernel::GridSize(a); \
constexpr dim3 blocks = kernel::BlockSize(); \
\
float ave_time = ck_tile::launch_kernel( \
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs)); \
return ave_time;
float
elementwise
(
elementwise_trait
t
,
elementwise_kargs
a
,
ck_tile
::
stream_config
s
)
{
float
rtn
=
-
1
;
if
(
t
.
op
==
"cast"
)
{
if
(
t
.
output_type
==
"fp32"
&&
t
.
input_type
==
"fp16"
)
{
DISPATCH_ELEMENTWISE_CAST
(
float
,
ck_tile
::
fp16_t
,
sizeof
(
ck_tile
::
fp16_t
),
8
)
}
else
if
(
t
.
output_type
==
"fp16"
&&
t
.
input_type
==
"fp32"
)
{
DISPATCH_ELEMENTWISE_CAST
(
ck_tile
::
fp16_t
,
float
,
sizeof
(
float
),
8
)
}
}
return
rtn
;
}
example/ck_tile/19_elementwise/elementwise_api.hpp
0 → 100644
View file @
e34eeb6d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise_unary.hpp"
#include <string>
struct
elementwise_trait
{
std
::
string
input_type
;
// input type
std
::
string
acc_type
;
// type to do intermediate computation
std
::
string
output_type
;
// type to store out
std
::
string
op
;
};
struct
elementwise_kargs
:
public
ck_tile
::
ElementwiseUnaryHostArgs
{
};
float
elementwise
(
elementwise_trait
t
,
elementwise_kargs
a
,
ck_tile
::
stream_config
s
);
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary.hpp
0 → 100644
View file @
e34eeb6d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_pipeline.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_policy.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_problem.hpp"
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp
0 → 100644
View file @
e34eeb6d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <string>
#include <type_traits>
namespace
ck_tile
{
struct
ElementwiseUnaryHostArgs
{
const
void
*
p_input
;
void
*
p_output
;
uint64_t
num_pixels
;
};
template
<
typename
Pipeline_
>
struct
ElementwiseUnaryKernel
{
using
Pipeline
=
remove_cvref_t
<
Pipeline_
>
;
using
Problem
=
remove_cvref_t
<
typename
Pipeline
::
Problem
>
;
using
InputType
=
typename
Problem
::
InputType
;
using
OutputType
=
typename
Problem
::
OutputType
;
struct
ElementwiseUnaryKargs
{
const
void
*
p_input
;
void
*
p_output
;
uint64_t
num_pixels
;
};
using
Kargs
=
ElementwiseUnaryKargs
;
using
Hargs
=
ElementwiseUnaryHostArgs
;
CK_TILE_HOST
static
constexpr
auto
GridSize
(
const
Hargs
&
h
)
{
constexpr
index_t
issues_per_block
=
Problem
::
BlockSize
*
Problem
::
Chunks
*
Problem
::
VectorSize
;
index_t
grids
=
static_cast
<
index_t
>
((
h
.
num_pixels
+
issues_per_block
-
1
)
/
issues_per_block
);
return
dim3
(
grids
);
}
CK_TILE_HOST
static
constexpr
auto
MakeKargs
(
const
Hargs
&
h
)
{
Kargs
k
;
k
.
p_input
=
h
.
p_input
;
k
.
p_output
=
h
.
p_output
;
k
.
num_pixels
=
h
.
num_pixels
;
return
k
;
}
CK_TILE_HOST_DEVICE
static
constexpr
auto
BlockSize
()
{
return
Problem
::
BlockSize
;
}
CK_TILE_DEVICE
void
operator
()(
Kargs
kargs
)
const
{
uint64_t
block_base
=
static_cast
<
uint64_t
>
(
blockIdx
.
x
)
*
Problem
::
BlockSize
*
Problem
::
VectorSize
;
uint64_t
pixels_rem
=
kargs
.
num_pixels
-
block_base
;
const
auto
input_window
=
[
&
]()
{
const
InputType
*
p_input
=
reinterpret_cast
<
const
InputType
*>
(
kargs
.
p_input
)
+
block_base
;
auto
tmp
=
make_naive_tensor_view_packed
<
address_space_enum
::
global
>
(
p_input
,
make_tuple
(
static_cast
<
index_t
>
(
pixels_rem
)),
number
<
Problem
::
VectorSize
>
{});
return
make_tile_window
(
tmp
,
make_tuple
(
number
<
Problem
::
BlockSize
*
Problem
::
VectorSize
>
{}),
{
0
});
}();
auto
output_window
=
[
&
]()
{
OutputType
*
p_output
=
reinterpret_cast
<
OutputType
*>
(
kargs
.
p_output
)
+
block_base
;
auto
tmp
=
make_naive_tensor_view_packed
<
address_space_enum
::
global
>
(
p_output
,
make_tuple
(
static_cast
<
index_t
>
(
pixels_rem
)),
number
<
Problem
::
VectorSize
>
{});
return
make_tile_window
(
tmp
,
make_tuple
(
number
<
Problem
::
BlockSize
*
Problem
::
VectorSize
>
{}),
{
0
});
}();
index_t
loop_stride
=
__builtin_amdgcn_readfirstlane
(
gridDim
.
x
*
Problem
::
BlockSize
*
Problem
::
VectorSize
);
Pipeline
{}(
input_window
,
output_window
,
loop_stride
);
}
};
}
// namespace ck_tile
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_pipeline.hpp
0 → 100644
View file @
e34eeb6d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_policy.hpp"
#include <string>
#include <type_traits>
#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW 1
#endif
namespace
ck_tile
{
template
<
typename
Problem_
,
typename
Policy_
=
ElementwiseUnaryPolicy
>
struct
ElementwiseUnaryipeline
{
// TODO: this kernel only support warp per row
using
Problem
=
remove_cvref_t
<
Problem_
>
;
using
Policy
=
remove_cvref_t
<
Policy_
>
;
using
UnaryFunctor
=
typename
Problem
::
UnaryFunctor
;
template
<
typename
InputWindow
,
typename
OutputWindow
>
CK_TILE_DEVICE
auto
operator
()(
const
InputWindow
&
inp_window
,
OutputWindow
&
out_window
,
index_t
loop_stride
)
{
auto
inp_win
=
make_tile_window
(
inp_window
.
get_bottom_tensor_view
(),
inp_window
.
get_window_lengths
(),
inp_window
.
get_window_origin
(),
Policy
::
template
MakeInputDistribution
<
Problem
>());
auto
out_win
=
make_tile_window
(
out_window
.
get_bottom_tensor_view
(),
out_window
.
get_window_lengths
(),
out_window
.
get_window_origin
(),
Policy
::
template
MakeOutputDistribution
<
Problem
>());
static_for
<
0
,
Problem
::
Chunks
,
1
>
{}([
&
](
auto
)
{
auto
x
=
load_tile
(
inp_win
);
auto
y
=
make_static_distributed_tensor
<
typename
Problem
::
OutputType
>
(
x
.
get_tile_distribution
());
tile_elementwise_inout
(
UnaryFunctor
{},
y
,
x
);
store_tile
(
out_win
,
y
);
move_tile_window
(
inp_win
,
{
loop_stride
});
move_tile_window
(
out_win
,
{
loop_stride
});
});
}
};
}
// namespace ck_tile
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_policy.hpp
0 → 100644
View file @
e34eeb6d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace
ck_tile
{
struct
ElementwiseUnaryPolicy
{
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeInputDistribution
()
{
// TODO: Y dim must have one dim that is not reduced
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
Problem
::
IssuesPerRow
,
Problem
::
WarpsPerBlock
,
Problem
::
LanesPerRow
,
Problem
::
VectorSize
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
1
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
2
>>
,
sequence
<
1
,
1
>
,
sequence
<
0
,
3
>>
{});
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeOutputDistribution
()
{
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
Problem
::
IssuesPerRow
,
Problem
::
WarpsPerBlock
,
Problem
::
LanesPerRow
,
Problem
::
VectorSize
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
1
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
2
>>
,
sequence
<
1
,
1
>
,
sequence
<
0
,
3
>>
{});
}
};
}
// namespace ck_tile
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_problem.hpp
0 → 100644
View file @
e34eeb6d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include <string>
#include <type_traits>
namespace
ck_tile
{
template
<
typename
InputType_
,
typename
OutputType_
,
typename
UnaryFunctor_
,
index_t
BytesPerIssue_
=
sizeof
(
InputType_
),
// this is input
index_t
Chunks_
=
8
,
index_t
BlockSize_
=
256
>
struct
ElementwiseUnaryWarpPerRowProblem
{
// TODO: this kernel only support warp per row
using
InputType
=
remove_cvref_t
<
InputType_
>
;
using
OutputType
=
remove_cvref_t
<
OutputType_
>
;
using
UnaryFunctor
=
remove_cvref_t
<
UnaryFunctor_
>
;
static
constexpr
index_t
Chunks
=
Chunks_
;
static
constexpr
index_t
BytesPerIssue
=
BytesPerIssue_
;
static
constexpr
index_t
BlockSize
=
BlockSize_
;
static
constexpr
index_t
WarpSize
=
get_warp_size
();
static_assert
(
BytesPerIssue
%
sizeof
(
InputType
)
==
0
);
static
constexpr
index_t
VectorSize
=
BytesPerIssue
/
sizeof
(
InputType
);
static
constexpr
index_t
LanesPerRow
=
WarpSize
;
static
constexpr
index_t
WarpsPerBlock
=
BlockSize
/
LanesPerRow
;
static
constexpr
index_t
IssuesPerRow
=
1
;
#if 0
static_assert(Experts % VectorSize == 0);
static constexpr index_t LanesPerRow = min(Experts / VectorSize, WarpSize);
static_assert(WarpSize % LanesPerRow == 0);
static constexpr index_t RowsPerWarpPerColIssue = WarpSize / LanesPerRow;
static constexpr index_t RowsPerWarp = IssuesPerCol * RowsPerWarpPerColIssue;
static constexpr index_t IssuesPerRow = Experts / (LanesPerRow * VectorSize);
static constexpr index_t WarpsPerBlock = BlockSize / WarpSize;
static constexpr index_t RowsPerBlock = RowsPerWarp * WarpsPerBlock;
#endif
};
}
// namespace ck_tile
example/ck_tile/19_elementwise/include/ck_tile/remod.py
0 → 100644
View file @
e34eeb6d
import
pathlib
from
pathlib
import
Path
import
subprocess
import
os
import
copy
NS
=
'ck_tile'
OPS
=
'ops'
OPS_COMMON
=
'common'
# common header will be duplicated into ops/* other module
HEADER_COMMON
=
"""// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
\n
"""
# aa/bb/cc/file.hpp -> (aa, bb, cc, file.hpp)
def
get_module
(
f
,
level
=
0
):
all_parts
=
f
.
parts
return
str
(
all_parts
[
level
])
all_files
=
[]
for
p
in
sorted
(
Path
(
"./"
).
rglob
(
"*"
)):
if
p
.
suffix
==
'.hpp'
:
all_files
.
append
(
pathlib
.
PurePath
(
p
))
class
submodule_t
:
def
__init__
(
self
):
self
.
m
=
dict
()
def
push
(
self
,
f
):
if
len
(
f
.
parents
)
!=
1
:
# ignore ./xxx.hpp
mod
=
get_module
(
f
)
if
mod
==
OPS
:
if
mod
not
in
self
.
m
.
keys
():
self
.
m
[
mod
]
=
dict
()
mod2
=
get_module
(
f
,
1
)
if
Path
(
mod2
).
suffix
!=
'.hpp'
:
# ignore ops/xxx.hpp
if
mod2
not
in
self
.
m
[
mod
].
keys
():
self
.
m
[
mod
][
mod2
]
=
list
()
self
.
m
[
mod
][
mod2
].
append
(
f
)
else
:
if
mod
not
in
self
.
m
.
keys
():
self
.
m
[
mod
]
=
list
()
self
.
m
[
mod
].
append
(
f
)
def
gen
(
self
):
def
gen_header
(
hpath
,
include_list
):
# print(hpath)
if
os
.
path
.
exists
(
str
(
hpath
)):
os
.
remove
(
str
(
hpath
))
with
hpath
.
open
(
'w'
)
as
f
:
f
.
write
(
HEADER_COMMON
)
f
.
write
(
'#pragma once
\n
'
)
f
.
write
(
'
\n
'
)
for
individual_header
in
include_list
:
header_path
=
NS
+
'/'
+
str
(
individual_header
)
f
.
write
(
f
'#include
\"
{
header_path
}
\"\n
'
)
# f.write('\n') # otherwise clang-format will complain
# print(self.m)
# restructure common
for
k
,
v
in
self
.
m
.
items
():
if
k
==
OPS
and
OPS_COMMON
in
v
.
keys
():
common_list
=
copy
.
deepcopy
(
v
[
OPS_COMMON
])
# v.pop(OPS_COMMON)
for
km
in
v
.
keys
():
if
km
!=
OPS_COMMON
:
v
[
km
].
extend
(
common_list
)
for
k
,
v
in
self
.
m
.
items
():
if
k
==
OPS
:
for
km
,
kv
in
v
.
items
():
gen_header
(
Path
(
k
)
/
(
f
'
{
km
}
.hpp'
),
kv
)
else
:
gen_header
(
Path
(
f
'
{
k
}
.hpp'
),
v
)
submodule
=
submodule_t
()
# formatting
for
x
in
all_files
:
subprocess
.
Popen
(
f
'dos2unix
{
str
(
x
)
}
'
,
shell
=
True
)
cmd
=
f
'clang-format-12 -style=file -i
{
str
(
x
)
}
'
#for xp in x.parents:
#print(get_file_base(x))
subprocess
.
Popen
(
cmd
,
shell
=
True
)
submodule
.
push
(
x
)
submodule
.
gen
()
#print(all_files)
example/ck_tile/CMakeLists.txt
View file @
e34eeb6d
...
@@ -6,3 +6,4 @@ add_subdirectory(01_fmha)
...
@@ -6,3 +6,4 @@ add_subdirectory(01_fmha)
add_subdirectory
(
02_layernorm2d
)
add_subdirectory
(
02_layernorm2d
)
add_subdirectory
(
03_gemm
)
add_subdirectory
(
03_gemm
)
add_subdirectory
(
06_permute
)
add_subdirectory
(
06_permute
)
add_subdirectory
(
19_elementwise
)
include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
View file @
e34eeb6d
...
@@ -1079,6 +1079,15 @@ struct ConvScaleRelu
...
@@ -1079,6 +1079,15 @@ struct ConvScaleRelu
float
scale_out_
;
float
scale_out_
;
};
};
template
<
typename
DstType
,
typename
SrcType
>
struct
Cast
{
template
<
typename
T
>
CK_TILE_HOST_DEVICE
void
operator
()(
DstType
&
y
,
const
SrcType
&
x
)
const
{
y
=
ck_tile
::
type_convert
<
DstType
>
(
x
);
};
};
// support fastconvert of int8 to fp16
// support fastconvert of int8 to fp16
#if 0
#if 0
template <typename InputDataType, typename OutputDataType, index_t RegPackNumber>
template <typename InputDataType, typename OutputDataType, index_t RegPackNumber>
...
...
include/ck_tile/ops/topk_softmax/pipeline/topk_softmax_warp_per_row_pipeline.hpp
View file @
e34eeb6d
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
#include <type_traits>
#include <type_traits>
#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
#ifndef TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
0
#define TOPK_SOFTMAX_USE_RAW_TILE_WINDOW
1
#endif
#endif
namespace
ck_tile
{
namespace
ck_tile
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment