Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
1e95a6e2
Commit
1e95a6e2
authored
Sep 13, 2024
by
carlushuang
Browse files
update script
parent
50ba9c44
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
140 additions
and
7 deletions
+140
-7
example/ck_tile/19_elementwise/elementwise.cpp
example/ck_tile/19_elementwise/elementwise.cpp
+67
-2
example/ck_tile/19_elementwise/elementwise_api.cpp
example/ck_tile/19_elementwise/elementwise_api.cpp
+45
-5
example/ck_tile/19_elementwise/elementwise_api.hpp
example/ck_tile/19_elementwise/elementwise_api.hpp
+1
-0
example/ck_tile/19_elementwise/script/bench.sh
example/ck_tile/19_elementwise/script/bench.sh
+27
-0
No files found.
example/ck_tile/19_elementwise/elementwise.cpp
View file @
1e95a6e2
...
...
@@ -17,6 +17,10 @@
#define TEST_ELEMENTWISE_VERBOSE 1
#endif
#ifndef TEST_ELEMENTWISE_HIPGRAPH
#define TEST_ELEMENTWISE_HIPGRAPH 1
#endif
template
<
typename
T
>
void
dump_host_tensor_2d
(
const
ck_tile
::
HostTensor
<
T
>&
x
)
{
...
...
@@ -149,6 +153,13 @@ bool test_cast(ck_tile::ArgParser args)
t_
.
input_type
=
input_prec
;
t_
.
output_type
=
output_prec
;
t_
.
op
=
std
::
string
(
"cast"
);
t_
.
num_cu
=
[
&
]()
{
hipDeviceProp_t
dev_prop
;
hipDevice_t
dev
;
HIP_CHECK_ERROR
(
hipGetDevice
(
&
dev
));
HIP_CHECK_ERROR
(
hipGetDeviceProperties
(
&
dev_prop
,
dev
));
return
dev_prop
.
multiProcessorCount
;
}();
return
t_
;
}();
...
...
@@ -161,11 +172,65 @@ bool test_cast(ck_tile::ArgParser args)
}();
#if TEST_ELEMENTWISE_VERBOSE
ck_tile
::
stream_config
sc
{
nullptr
,
true
};
#if !TEST_ELEMENTWISE_HIPGRAPH
ck_tile
::
stream_config
sc
{
nullptr
,
true
,
0
,
20
,
50
,
false
};
// ck_tile::stream_config sc{nullptr};
auto
ms
=
elementwise
(
trait
,
karg
,
sc
);
#else
float
ms
=
0
;
{
int
repeat
=
50
;
int
warpup
=
20
;
hipGraph_t
graph_
;
hipStream_t
stream_
;
HIP_CHECK_ERROR
(
hipStreamCreate
(
&
stream_
));
ck_tile
::
stream_config
sc
{
stream_
};
HIP_CHECK_ERROR
(
hipStreamBeginCapture
(
sc
.
stream_id_
,
hipStreamCaptureModeGlobal
));
for
(
int
i_r
=
0
;
i_r
<
repeat
;
i_r
++
)
{
elementwise
(
trait
,
karg
,
sc
);
}
HIP_CHECK_ERROR
(
hipStreamEndCapture
(
sc
.
stream_id_
,
&
graph_
));
hipGraphExec_t
instance_
;
HIP_CHECK_ERROR
(
hipGraphInstantiate
(
&
instance_
,
graph_
,
nullptr
,
nullptr
,
0
));
hipEvent_t
start_
,
stop_
;
HIP_CHECK_ERROR
(
hipEventCreate
(
&
start_
));
HIP_CHECK_ERROR
(
hipEventCreate
(
&
stop_
));
//warm-up
for
(
int
i_r
=
0
;
i_r
<
warpup
;
i_r
++
)
{
elementwise
(
trait
,
karg
,
sc
);
}
HIP_CHECK_ERROR
(
hipDeviceSynchronize
());
HIP_CHECK_ERROR
(
hipEventRecord
(
start_
,
sc
.
stream_id_
));
HIP_CHECK_ERROR
(
hipGraphLaunch
(
instance_
,
sc
.
stream_id_
));
HIP_CHECK_ERROR
(
hipEventRecord
(
stop_
,
sc
.
stream_id_
));
HIP_CHECK_ERROR
(
hipEventSynchronize
(
stop_
));
HIP_CHECK_ERROR
(
hipGetLastError
());
HIP_CHECK_ERROR
(
hipGraphDestroy
(
graph_
));
float
total_time
=
0
;
HIP_CHECK_ERROR
(
hipEventElapsedTime
(
&
total_time
,
start_
,
stop_
));
ms
=
total_time
/
repeat
;
}
#endif
auto
gbps
=
[
&
](){
double
total_bytes
=
num_pixels
*
sizeof
(
SrcType
)
+
num_pixels
*
sizeof
(
DstType
);
return
total_bytes
/
1.E6
/
ms
;
}();
printf
(
"[cast] %s->%s, n:%lu,
ms:%f
, "
,
input_prec
.
c_str
(),
output_prec
.
c_str
(),
num_pixels
,
ms
);
"[cast] %s->%s, n:%lu,
ns:%f(ms:%f), %.2fGB/s
, "
,
input_prec
.
c_str
(),
output_prec
.
c_str
(),
num_pixels
,
ms
*
1e6
,
ms
,
gbps
);
if
(
ms
<
0
)
printf
(
"not supported
\n
"
);
fflush
(
stdout
);
...
...
example/ck_tile/19_elementwise/elementwise_api.cpp
View file @
1e95a6e2
...
...
@@ -11,12 +11,12 @@ struct Cast
};
}
// namespace impl
#define DISPATCH_E
LEMENTWISE
_CAST(d_type_, s_type_, byte_per_issue_, chunks_) \
#define DISPATCH_E_CAST
_
(d_type_, s_type_, byte_per_issue_, chunks_
, bs_
) \
using src_t = s_type_; \
using dst_t = d_type_; \
using u_fun = typename impl::Cast; \
using problem = \
ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_>; \
ck_tile::ElementwiseUnaryWarpPerRowProblem<src_t, dst_t, u_fun, byte_per_issue_, chunks_
, bs_
>; \
using pipeline = ck_tile::ElementwiseUnaryipeline<problem>; \
using kernel = ck_tile::ElementwiseUnaryKernel<pipeline>; \
\
...
...
@@ -35,11 +35,51 @@ float elementwise(elementwise_trait t, elementwise_kargs a, ck_tile::stream_conf
{
if
(
t
.
output_type
==
"fp32"
&&
t
.
input_type
==
"fp16"
)
{
DISPATCH_ELEMENTWISE_CAST
(
float
,
ck_tile
::
fp16_t
,
8
*
sizeof
(
ck_tile
::
fp16_t
),
8
)
constexpr
int
eb
=
sizeof
(
ck_tile
::
fp16_t
);
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
64
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
64
)
}
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
128
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
128
)
}
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
3
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
256
)
}
else
if
(
a
.
num_pixels
%
4
==
0
)
{
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
4
*
8
))
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
4
*
eb
,
1
,
256
)
}
else
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
4
*
eb
,
8
,
256
)
}
}
else
{
DISPATCH_E_CAST_
(
float
,
ck_tile
::
fp16_t
,
1
*
eb
,
1
,
256
)
}
}
else
if
(
t
.
output_type
==
"fp16"
&&
t
.
input_type
==
"fp32"
)
{
DISPATCH_ELEMENTWISE_CAST
(
ck_tile
::
fp16_t
,
float
,
4
*
sizeof
(
float
),
8
)
constexpr
int
eb
=
sizeof
(
float
);
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
64
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
64
)
}
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
128
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
128
)
}
else
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
3
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
256
)
}
else
if
(
a
.
num_pixels
%
4
==
0
)
{
if
(
a
.
num_pixels
<
(
static_cast
<
uint64_t
>
(
t
.
num_cu
)
*
256
*
4
*
8
))
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
4
*
eb
,
1
,
256
)
}
else
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
4
*
eb
,
8
,
256
)
}
}
else
{
DISPATCH_E_CAST_
(
ck_tile
::
fp16_t
,
float
,
1
*
eb
,
1
,
256
)
}
}
}
return
rtn
;
...
...
example/ck_tile/19_elementwise/elementwise_api.hpp
View file @
1e95a6e2
...
...
@@ -13,6 +13,7 @@ struct elementwise_trait
std
::
string
acc_type
;
// type to do intermediate computation
std
::
string
output_type
;
// type to store out
std
::
string
op
;
int
num_cu
;
};
struct
elementwise_kargs
:
public
ck_tile
::
ElementwiseUnaryHostArgs
...
...
example/ck_tile/19_elementwise/script/bench.sh
0 → 100644
View file @
1e95a6e2
#!/bin/sh
EXE
=
./build/bin/tile_example_elementwise
$EXE
-pr_i
=
fp16
-pr_o
=
fp32
-n
=
2043904
$EXE
-pr_i
=
fp16
-pr_o
=
fp32
-n
=
992256
$EXE
-pr_i
=
fp16
-pr_o
=
fp32
-n
=
846304
$EXE
-pr_i
=
fp16
-pr_o
=
fp32
-n
=
434176
$EXE
-pr_i
=
fp16
-pr_o
=
fp32
-n
=
159424
$EXE
-pr_i
=
fp16
-pr_o
=
fp32
-n
=
98304
$EXE
-pr_i
=
fp16
-pr_o
=
fp32
-n
=
73728
$EXE
-pr_i
=
fp16
-pr_o
=
fp32
-n
=
17408
$EXE
-pr_i
=
fp16
-pr_o
=
fp32
-n
=
512
$EXE
-pr_i
=
fp16
-pr_o
=
fp32
-n
=
256
echo
"-------------------------------------"
$EXE
-pr_i
=
fp32
-pr_o
=
fp16
-n
=
2043904
$EXE
-pr_i
=
fp32
-pr_o
=
fp16
-n
=
992256
$EXE
-pr_i
=
fp32
-pr_o
=
fp16
-n
=
846304
$EXE
-pr_i
=
fp32
-pr_o
=
fp16
-n
=
434176
$EXE
-pr_i
=
fp32
-pr_o
=
fp16
-n
=
159424
$EXE
-pr_i
=
fp32
-pr_o
=
fp16
-n
=
98304
$EXE
-pr_i
=
fp32
-pr_o
=
fp16
-n
=
73728
$EXE
-pr_i
=
fp32
-pr_o
=
fp16
-n
=
17408
$EXE
-pr_i
=
fp32
-pr_o
=
fp16
-n
=
512
$EXE
-pr_i
=
fp32
-pr_o
=
fp16
-n
=
256
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment