Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
eb44e047
Unverified
Commit
eb44e047
authored
Jul 08, 2024
by
Andriy Roshchenko
Committed by
GitHub
Jul 08, 2024
Browse files
Add ckProfiler support for forward 3D convolutions with OUT element-wise operations. (#1354)
parent
75e622f0
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
604 additions
and
1 deletion
+604
-1
library/include/ck/library/utility/host_tensor.hpp
library/include/ck/library/utility/host_tensor.hpp
+9
-1
profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp
...e/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp
+352
-0
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+3
-0
profiler/src/profile_grouped_conv_fwd_outelementop.cpp
profiler/src/profile_grouped_conv_fwd_outelementop.cpp
+220
-0
script/profile_grouped_conv_fwd_outelementop.sh
script/profile_grouped_conv_fwd_outelementop.sh
+20
-0
No files found.
library/include/ck/library/utility/host_tensor.hpp
View file @
eb44e047
...
@@ -43,7 +43,15 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
...
@@ -43,7 +43,15 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
first
=
false
;
first
=
false
;
else
else
os
<<
delim
;
os
<<
delim
;
os
<<
static_cast
<
T
>
(
v
);
if
constexpr
(
std
::
is_same_v
<
T
,
ck
::
f8_t
>
||
std
::
is_same_v
<
T
,
ck
::
bf8_t
>
)
{
os
<<
ck
::
type_convert
<
float
>
(
v
);
}
else
{
os
<<
static_cast
<
T
>
(
v
);
}
}
}
return
os
;
return
os
;
}
}
...
...
profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp
0 → 100644
View file @
eb44e047
#pragma once
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convinvscale.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
namespace
ck
{
namespace
profiler
{
template
<
typename
DataType
>
inline
constexpr
double
get_rtol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1e-6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5e-2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
1e-1
;
// 240 and 224 are acceptable
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
1.5e-1
;
// 57344 and 49152 are acceptable
}
else
{
return
1e-3
;
}
}
template
<
typename
DataType
>
inline
constexpr
double
get_atol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1e-6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1e-3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5e-2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1e-1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
16.1
;
// 240 and 224 are acceptable
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
8192.1
;
// 57344 and 49152 are acceptable
}
else
{
return
1e-3
;
}
}
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
,
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
,
typename
OutElementOp
,
typename
AComputeType
=
InDataType
,
typename
BComputeType
=
AComputeType
>
bool
profile_grouped_conv_fwd_outelementop_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
auto
pass
=
true
;
// return status
using
CShuffleDataType
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
InElementOp
=
PassThrough
;
using
WeiElementOp
=
PassThrough
;
const
auto
in_element_op
=
InElementOp
{};
const
auto
wei_element_op
=
WeiElementOp
{};
const
auto
in_g_n_c_wis_desc
=
ck
::
utils
::
conv
::
make_input_host_tensor_descriptor_g_n_c_wis_packed
<
InLayout
>
(
conv_param
);
const
auto
wei_g_k_c_xs_desc
=
ck
::
utils
::
conv
::
make_weight_host_tensor_descriptor_g_k_c_xs_packed
<
WeiLayout
>
(
conv_param
);
const
auto
out_g_n_k_wos_desc
=
ck
::
utils
::
conv
::
make_output_host_tensor_descriptor_g_n_k_wos_packed
<
OutLayout
>
(
conv_param
);
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
ck
::
ranges
::
copy
(
x
,
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
wei_g_k_c_xs_desc
.
GetLengths
(),
b_g_k_c_xs_lengths
);
copy
(
wei_g_k_c_xs_desc
.
GetStrides
(),
b_g_k_c_xs_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
e_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
e_g_n_k_wos_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
Tensor
<
InDataType
>
input
(
in_g_n_c_wis_desc
);
Tensor
<
WeiDataType
>
weight
(
wei_g_k_c_xs_desc
);
Tensor
<
CShuffleDataType
>
c
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
host_output
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
device_output
(
out_g_n_k_wos_desc
);
std
::
cout
<<
"input: "
<<
input
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"weight: "
<<
weight
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"output: "
<<
host_output
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
input
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
weight
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
1
,
1
});
break
;
default:
input
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
5.0
,
5.0
});
weight
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
1.0
,
1.0
});
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
input
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
weight
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
device_output
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
input
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
weight
.
mData
.
data
());
// random scale values
auto
scale_in
=
type_convert
<
float
>
(
type_convert
<
f8_t
>
(
2.0
f
*
float
(
RAND_MAX
/
2
-
std
::
rand
())
/
float
(
RAND_MAX
)));
auto
scale_wei
=
type_convert
<
float
>
(
type_convert
<
f8_t
>
(
2.0
f
*
float
(
RAND_MAX
/
2
-
std
::
rand
())
/
float
(
RAND_MAX
)));
auto
scale_out
=
type_convert
<
float
>
(
type_convert
<
f8_t
>
(
2.0
f
*
float
(
RAND_MAX
/
2
-
std
::
rand
())
/
float
(
RAND_MAX
)));
// initialize out_element_op for each iteration
const
auto
out_element_op
=
OutElementOp
{
scale_in
,
scale_wei
,
scale_out
};
std
::
cout
<<
"scale_in: "
<<
scale_in
<<
std
::
endl
;
std
::
cout
<<
"scale_wei: "
<<
scale_wei
<<
std
::
endl
;
std
::
cout
<<
"scale_out: "
<<
scale_out
<<
std
::
endl
;
// run reference op
if
(
do_verification
)
{
std
::
cout
<<
"
\n
Verifying algorithm against reference convolution..."
<<
std
::
endl
;
std
::
cout
<<
"
\t
Using (rel_tol,abs_tol) = ("
<<
std
::
setprecision
(
7
)
<<
get_rtol
<
OutDataType
>
()
<<
", "
<<
get_atol
<
OutDataType
>
()
<<
")"
<<
std
::
endl
;
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InDataType
,
WeiDataType
,
CShuffleDataType
,
InElementOp
,
WeiElementOp
,
PassThrough
>
{};
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
input
,
weight
,
c
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
,
in_element_op
,
wei_element_op
,
PassThrough
{});
c
.
SetZero
();
ref_invoker
.
Run
(
ref_argument
);
host_output
.
ForEach
([
&
](
auto
&
,
auto
idx
)
{
out_element_op
(
host_output
(
idx
),
c
(
idx
));
});
}
std
::
string
best_op_name
;
float
best_avg_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
auto
run_impl
=
[
&
](
auto
&
op_ptr
,
auto
&
argument_ptr
)
{
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
// re-init output to zero before profiling next kernel
out_device_buf
.
SetZero
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
conv_param
.
GetFlops
();
std
::
size_t
num_btype
=
conv_param
.
GetByte
<
InDataType
,
WeiDataType
,
OutDataType
>
();
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
out_device_buf
.
FromDevice
(
device_output
.
mData
.
data
());
pass
=
pass
&
ck
::
utils
::
check_err
(
device_output
,
host_output
,
"Error: Device and Host results do not match!"
,
get_rtol
<
OutDataType
>
(),
get_atol
<
OutDataType
>
());
if
(
do_log
)
{
LogRangeAsType
<
InDataType
>
(
std
::
cout
<<
"input : "
,
input
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
WeiDataType
>
(
std
::
cout
<<
"weight: "
,
weight
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
OutDataType
>
(
std
::
cout
<<
"host_output : "
,
host_output
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
OutDataType
>
(
std
::
cout
<<
"device_output: "
,
device_output
.
mData
,
","
)
<<
std
::
endl
;
}
}
}
else
{
std
::
cout
<<
op_ptr
->
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
}
};
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleABD
<
NDimSpatial
,
InLayout
,
WeiLayout
,
ck
::
Tuple
<>
,
OutLayout
,
InDataType
,
WeiDataType
,
ck
::
Tuple
<>
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
AComputeType
,
BComputeType
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"ckProfiler found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in_device_buf
.
GetDeviceBuffer
(),
wei_device_buf
.
GetDeviceBuffer
(),
{},
out_device_buf
.
GetDeviceBuffer
(),
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
{},
{},
e_g_n_k_wos_lengths
,
e_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
in_element_op
,
wei_element_op
,
out_element_op
);
run_impl
(
op_ptr
,
argument_ptr
);
}
std
::
cout
<<
"Best configuration parameters:"
<<
"
\n
name: "
<<
best_op_name
<<
"
\n
avg_time: "
<<
best_avg_time
<<
"
\n
tflops: "
<<
best_tflops
<<
"
\n
GB/s: "
<<
best_gb_per_sec
<<
std
::
endl
;
return
pass
;
}
}
// namespace profiler
}
// namespace ck
profiler/src/CMakeLists.txt
View file @
eb44e047
...
@@ -57,6 +57,7 @@ if(GPU_TARGETS MATCHES "gfx9")
...
@@ -57,6 +57,7 @@ if(GPU_TARGETS MATCHES "gfx9")
list
(
APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp
)
list
(
APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp
)
list
(
APPEND PROFILER_SOURCES profile_conv_bwd_data.cpp
)
list
(
APPEND PROFILER_SOURCES profile_conv_bwd_data.cpp
)
list
(
APPEND PROFILER_SOURCES profile_conv_fwd.cpp
)
list
(
APPEND PROFILER_SOURCES profile_conv_fwd.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_conv_fwd_outelementop.cpp
)
endif
()
endif
()
...
@@ -134,6 +135,8 @@ if(GPU_TARGETS MATCHES "gfx9")
...
@@ -134,6 +135,8 @@ if(GPU_TARGETS MATCHES "gfx9")
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv1d_bwd_weight_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv1d_bwd_weight_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_weight_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_weight_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_fwd_convscale_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_fwd_convinvscale_instance
)
endif
()
endif
()
if
(
GPU_TARGETS MATCHES
"gfx9"
OR GPU_TARGETS MATCHES
"gfx11"
OR GPU_TARGETS MATCHES
"gfx12"
)
if
(
GPU_TARGETS MATCHES
"gfx9"
OR GPU_TARGETS MATCHES
"gfx11"
OR GPU_TARGETS MATCHES
"gfx12"
)
...
...
profiler/src/profile_grouped_conv_fwd_outelementop.cpp
0 → 100644
View file @
eb44e047
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "profiler/profile_grouped_conv_fwd_outelementop_impl.hpp"
#include "ck/utility/data_type.hpp"
#include "profiler_operation_registry.hpp"
#include <iostream>
enum
struct
ConvLayout
{
GNHWC_GKYXC_GNHWK
=
0
,
NHWGC_GKYXC_NHWGK
=
1
};
enum
struct
OutElementOp
{
ConvScale
=
0
,
ConvInvScale
=
1
};
enum
struct
ConvDataType
{
F8_F8_F8
=
0
,
BF8_BF8_F8
=
1
,
F8_BF8_F8
=
2
,
BF8_F8_F8
=
3
};
#define OP_NAME "grouped_conv_fwd_outelementop"
#define OP_DESC "Grouped Convolution Forward+Elementwise Operation"
static
void
print_helper_msg
()
{
// clang-format off
std
::
cout
<<
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
<<
"arg2: data type (0: Input fp8, Weight fp8, Output fp8
\n
"
<<
" 1: Input bf8, Weight bf8, Output fp8
\n
"
<<
" 2: Input fp8, Weight bf8, Output fp8
\n
"
<<
" 3: Input bf8, Weight fp8, Output fp8)
\n
"
<<
"arg3: element-wise operation (0: ConvScale
\n
"
<<
" 1: ConvInvScale)
\n
"
<<
"arg4: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]
\n
"
<<
" 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])
\n
"
<<
"arg5: verification (0: no, 1: yes)
\n
"
<<
"arg6: initialization (0: no init, 1: integer value, 2: decimal value)
\n
"
<<
"arg7: print tensor value (0: no; 1: yes)
\n
"
<<
"arg8: time kernel (0: no, 1: yes)
\n
"
<<
ck
::
utils
::
conv
::
get_conv_param_parser_helper_msg
()
<<
std
::
endl
;
// clang-format on
}
int
grouped_conv_fwd_outelementop
(
int
argc
,
char
*
argv
[])
{
// 9 total, 1 for num_dim_spatial
if
(
argc
<
10
)
{
print_helper_msg
();
return
1
;
}
const
auto
data_type
=
static_cast
<
ConvDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
op
=
static_cast
<
OutElementOp
>
(
std
::
stoi
(
argv
[
3
]));
const
auto
layout
=
static_cast
<
ConvLayout
>
(
std
::
stoi
(
argv
[
4
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
5
]);
const
int
init_method
=
std
::
stoi
(
argv
[
6
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
7
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
8
]);
const
int
num_dim_spatial
=
std
::
stoi
(
argv
[
9
]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + 1 for argv[0]
if
(
argc
!=
8
+
1
+
4
+
6
*
num_dim_spatial
+
1
)
{
print_helper_msg
();
return
1
;
}
const
auto
params
=
ck
::
utils
::
conv
::
parse_conv_param
(
num_dim_spatial
,
10
,
argv
);
using
F8
=
ck
::
f8_t
;
using
BF8
=
ck
::
bf8_t
;
using
GKZYXC
=
ck
::
tensor_layout
::
convolution
::
GKZYXC
;
using
NDHWGC
=
ck
::
tensor_layout
::
convolution
::
NDHWGC
;
using
NDHWGK
=
ck
::
tensor_layout
::
convolution
::
NDHWGK
;
using
ConvScale
=
ck
::
tensor_operation
::
element_wise
::
ConvScale
;
using
ConvInvScale
=
ck
::
tensor_operation
::
element_wise
::
ConvInvscale
;
constexpr
auto
I3
=
ck
::
Number
<
3
>
{};
auto
profile
=
[
&
](
auto
num_dim_spatial_tmp
,
auto
in_layout
,
auto
wei_layout
,
auto
out_layout
,
auto
in_type
,
auto
wei_type
,
auto
out_type
,
auto
out_element_op
,
auto
a_compute_type
,
auto
b_compute_type
)
{
constexpr
ck
::
index_t
NDimSpatial
=
num_dim_spatial_tmp
.
value
;
using
InLayout
=
decltype
(
in_layout
);
using
WeiLayout
=
decltype
(
wei_layout
);
using
OutLayout
=
decltype
(
out_layout
);
using
InDataType
=
decltype
(
in_type
);
using
WeiDataType
=
decltype
(
wei_type
);
using
OutDataType
=
decltype
(
out_type
);
using
OutElementOp
=
decltype
(
out_element_op
);
using
AComputeType
=
decltype
(
a_compute_type
);
using
BComputeType
=
decltype
(
b_compute_type
);
bool
pass
=
ck
::
profiler
::
profile_grouped_conv_fwd_outelementop_impl
<
NDimSpatial
,
InLayout
,
WeiLayout
,
OutLayout
,
InDataType
,
WeiDataType
,
OutDataType
,
OutElementOp
,
AComputeType
,
BComputeType
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
params
);
return
pass
?
0
:
1
;
};
if
(
num_dim_spatial
==
3
&&
layout
==
ConvLayout
::
NHWGC_GKYXC_NHWGK
)
{
if
(
op
==
OutElementOp
::
ConvScale
)
{
if
(
data_type
==
ConvDataType
::
F8_F8_F8
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F8
{},
F8
{},
F8
{},
ConvScale
{},
F8
{},
F8
{});
}
else
if
(
data_type
==
ConvDataType
::
BF8_BF8_F8
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
BF8
{},
BF8
{},
F8
{},
ConvScale
{},
BF8
{},
BF8
{});
}
else
if
(
data_type
==
ConvDataType
::
F8_BF8_F8
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F8
{},
BF8
{},
F8
{},
ConvScale
{},
F8
{},
BF8
{});
}
else
if
(
data_type
==
ConvDataType
::
BF8_F8_F8
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
BF8
{},
F8
{},
F8
{},
ConvScale
{},
BF8
{},
F8
{});
}
}
else
if
(
op
==
OutElementOp
::
ConvInvScale
)
{
if
(
data_type
==
ConvDataType
::
F8_F8_F8
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F8
{},
F8
{},
F8
{},
ConvInvScale
{},
F8
{},
F8
{});
}
else
if
(
data_type
==
ConvDataType
::
BF8_BF8_F8
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
BF8
{},
BF8
{},
F8
{},
ConvInvScale
{},
BF8
{},
BF8
{});
}
else
if
(
data_type
==
ConvDataType
::
F8_BF8_F8
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F8
{},
BF8
{},
F8
{},
ConvInvScale
{},
F8
{},
BF8
{});
}
else
if
(
data_type
==
ConvDataType
::
BF8_F8_F8
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
BF8
{},
F8
{},
F8
{},
ConvInvScale
{},
BF8
{},
F8
{});
}
}
}
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
return
1
;
}
REGISTER_PROFILER_OPERATION
(
OP_NAME
,
OP_DESC
,
grouped_conv_fwd_outelementop
);
script/profile_grouped_conv_fwd_outelementop.sh
0 → 100755
View file @
eb44e047
#!/bin/bash
## GPU visibility
export
HIP_VISIBLE_DEVICES
=
0
DRIVER
=
"../build/bin/ckProfiler"
OP
=
$1
DATATYPE
=
$2
OUTELEMENTOP
=
$3
LAYOUT
=
$4
VERIFY
=
$5
INIT
=
$6
LOG
=
$7
TIME
=
$8
N
=
$9
####### op datatype OUTELEMENTOP layout verify init log time Ndims G N K C Z Y X Di Hi Wi Sz Sy Sx Dz Dy Dx Left Pz LeftPy LeftPx RightPz RightPy RightPx
$DRIVER
$OP
$DATATYPE
$OUTELEMENTOP
$LAYOUT
$VERIFY
$INIT
$LOG
$TIME
3 32
$N
96 96 3 3 3 28 28 28 1 1 1 1 1 1 1 1 1 1 1 1
$DRIVER
$OP
$DATATYPE
$OUTELEMENTOP
$LAYOUT
$VERIFY
$INIT
$LOG
$TIME
3 32
$N
192 192 3 3 3 28 28 28 1 1 1 1 1 1 1 1 1 1 1 1
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment