Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
e20ed766
Commit
e20ed766
authored
Sep 13, 2024
by
carlushuang
Browse files
format
parent
1e95a6e2
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
78 additions
and
57 deletions
+78
-57
example/ck_tile/19_elementwise/elementwise.cpp
example/ck_tile/19_elementwise/elementwise.cpp
+20
-13
example/ck_tile/19_elementwise/elementwise_api.cpp
example/ck_tile/19_elementwise/elementwise_api.cpp
+51
-35
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp
...ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp
+4
-7
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_pipeline.hpp
...elementwise_unary/pipeline/elementwise_unary_pipeline.hpp
+3
-2
No files found.
example/ck_tile/19_elementwise/elementwise.cpp
View file @
e20ed766
...
...
@@ -188,7 +188,8 @@ bool test_cast(ck_tile::ArgParser args)
ck_tile
::
stream_config
sc
{
stream_
};
HIP_CHECK_ERROR
(
hipStreamBeginCapture
(
sc
.
stream_id_
,
hipStreamCaptureModeGlobal
));
for
(
int
i_r
=
0
;
i_r
<
repeat
;
i_r
++
)
{
for
(
int
i_r
=
0
;
i_r
<
repeat
;
i_r
++
)
{
elementwise
(
trait
,
karg
,
sc
);
}
HIP_CHECK_ERROR
(
hipStreamEndCapture
(
sc
.
stream_id_
,
&
graph_
));
...
...
@@ -201,8 +202,9 @@ bool test_cast(ck_tile::ArgParser args)
HIP_CHECK_ERROR
(
hipEventCreate
(
&
start_
));
HIP_CHECK_ERROR
(
hipEventCreate
(
&
stop_
));
//warm-up
for
(
int
i_r
=
0
;
i_r
<
warpup
;
i_r
++
)
{
// warm-up
for
(
int
i_r
=
0
;
i_r
<
warpup
;
i_r
++
)
{
elementwise
(
trait
,
karg
,
sc
);
}
HIP_CHECK_ERROR
(
hipDeviceSynchronize
());
...
...
@@ -225,12 +227,17 @@ bool test_cast(ck_tile::ArgParser args)
ms
=
total_time
/
repeat
;
}
#endif
auto
gbps
=
[
&
](){
auto
gbps
=
[
&
]()
{
double
total_bytes
=
num_pixels
*
sizeof
(
SrcType
)
+
num_pixels
*
sizeof
(
DstType
);
return
total_bytes
/
1.E6
/
ms
;
}();
printf
(
"[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, "
,
input_prec
.
c_str
(),
output_prec
.
c_str
(),
num_pixels
,
ms
*
1e6
,
ms
,
gbps
);
printf
(
"[cast] %s->%s, n:%lu, ns:%f(ms:%f), %.2fGB/s, "
,
input_prec
.
c_str
(),
output_prec
.
c_str
(),
num_pixels
,
ms
*
1e6
,
ms
,
gbps
);
if
(
ms
<
0
)
printf
(
"not supported
\n
"
);
fflush
(
stdout
);
...
...
example/ck_tile/19_elementwise/elementwise_api.cpp
View file @
e20ed766
...
...
@@ -15,8 +15,8 @@ struct Cast
using src_t = s_type_; \
using dst_t = d_type_; \
using u_fun = typename impl::Cast; \
using problem =
\
ck_tile::
ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
using problem =
ck_tile::
\
ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_, bs_>; \
using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \
using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \
\
...
...
@@ -25,7 +25,9 @@ struct Cast
constexpr dim3 blocks = kernel::BlockSize(); \
\
float ave_time = ck_tile::launch_kernel( \
s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
s, \
ck_tile::make_kernel<blocks.x, 1>( \
kernel{}, grids, blocks, 0, kargs.p_input, kargs.p_output, kargs.num_pixels)); \
return ave_time;
float
elementwise
(
elementwise_trait
t
,
elementwise_kargs
a
,
ck_tile
::
stream_config
s
)
...
...
@@ -36,49 +38,63 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf
if
(
t
.
output_type
==
"fp32"
&&
t
.
input_type
==
"fp16"
)
{
constexpr
int
eb
=
sizeof
(
ck_tile
::
fp16_t
);
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
64
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
64
)
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
64
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
64
)
}
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
128
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
128
)
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
128
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
128
)
}
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
3
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
256
)
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
3
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
256
)
}
else
if
(
a
.
num_pixels
%
4
==
0
)
{
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
4
*
8
))
{
else
if
(
a
.
num_pixels
%
4
==
0
)
{
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
4
*
8
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
4
*
eb
,
1
,
256
)
}
else
{
else
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
4
*
eb
,
8
,
256
)
}
}
else
{
else
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
256
)
}
}
else
if
(
t
.
output_type
==
"fp16"
&&
t
.
input_type
==
"fp32"
)
{
constexpr
int
eb
=
sizeof
(
float
);
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
64
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
64
)
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
64
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
64
)
}
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
128
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
128
)
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
128
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
128
)
}
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
3
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
256
)
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
3
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
256
)
}
else
if
(
a
.
num_pixels
%
4
==
0
)
{
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
4
*
8
))
{
else
if
(
a
.
num_pixels
%
4
==
0
)
{
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
4
*
8
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
4
*
eb
,
1
,
256
)
}
else
{
else
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
4
*
eb
,
8
,
256
)
}
}
else
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
256
)
else
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
256
)
}
}
}
...
...
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/kernel/elementwise_unary_kernel.hpp
View file @
e20ed766
...
...
@@ -57,17 +57,15 @@ struct ElementwiseUnaryKernel
CK_TILE_HOST_DEVICE
static
constexpr
auto
BlockSize
()
{
return
Problem
::
BlockSize
;
}
CK_TILE_DEVICE
void
operator
()(
const
void
*
p_input_
,
void
*
p_output_
,
uint64_t
num_pixels_
)
const
CK_TILE_DEVICE
void
operator
()(
const
void
*
p_input_
,
void
*
p_output_
,
uint64_t
num_pixels_
)
const
{
uint64_t
block_base
=
static_cast
<
uint64_t
>
(
blockIdx
.
x
)
*
Problem
::
BlockSize
*
Problem
::
VectorSize
;
uint64_t
pixels_rem
=
num_pixels_
-
block_base
;
const
auto
input_window
=
[
&
]()
{
const
InputType
*
p_input
=
reinterpret_cast
<
const
InputType
*>
(
p_input_
)
+
block_base
;
const
InputType
*
p_input
=
reinterpret_cast
<
const
InputType
*>
(
p_input_
)
+
block_base
;
auto
tmp
=
make_naive_tensor_view_packed
<
address_space_enum
::
global
>
(
p_input
,
...
...
@@ -79,8 +77,7 @@ struct ElementwiseUnaryKernel
}();
auto
output_window
=
[
&
]()
{
OutputType
*
p_output
=
reinterpret_cast
<
OutputType
*>
(
p_output_
)
+
block_base
;
OutputType
*
p_output
=
reinterpret_cast
<
OutputType
*>
(
p_output_
)
+
block_base
;
auto
tmp
=
make_naive_tensor_view_packed
<
address_space_enum
::
global
>
(
p_output
,
...
...
example/ck_tile/19_elementwise/include/ck_tile/ops/elementwise_unary/pipeline/elementwise_unary_pipeline.hpp
View file @
e20ed766
...
...
@@ -37,7 +37,8 @@ struct ElementwiseUnaryipeline
static_for
<
0
,
Problem
::
Chunks
,
1
>
{}([
&
](
auto
)
{
auto
x
=
load_tile
(
inp_win
);
auto
y
=
make_static_distributed_tensor
<
typename
Problem
::
OutputType
>
(
x
.
get_tile_distribution
());
auto
y
=
make_static_distributed_tensor
<
typename
Problem
::
OutputType
>
(
x
.
get_tile_distribution
());
tile_elementwise_inout
(
UnaryFunctor
{},
y
,
x
);
store_tile
(
out_win
,
y
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment