gaoqiong / composable_kernel · Commits

Commit 11edd0f0, authored Jul 13, 2022 by Chao Liu

    update conv fwd profiler

Parent: 0cb8ba92

Showing 20 changed files with 886 additions and 544 deletions (+886 -544).
Changed files:

  example/09_convnd_fwd/convnd_fwd_common.hpp                                                        +39  -29
  example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp                                                      +4   -3
  include/ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp                       +26  -5
  include/ck/utility/tuple.hpp                                                                       +21  -7
  library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp                   +2   -0
  library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp         +16  -1
  library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp                   +1   -1
  library/include/ck/library/utility/host_tensor.hpp                                                 +8   -0
  library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt                                +0   -7
  library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp   +64  -67
  library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp    +64  -67
  library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp    +65  -68
  library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp   +52  -52
  library/src/utility/convolution_parameter.cpp                                                      +4   -4
  profiler/CMakeLists.txt                                                                            +37  -37
  profiler/include/profile_conv_fwd_impl.hpp                                                         +170 -135
  profiler/include/profile_gemm_impl.hpp                                                             +14  -15
  profiler/src/profile_conv_fwd.cpp                                                                  +244 -0
  profiler/src/profile_gemm.cpp                                                                      +33  -27
  profiler/src/profiler.cpp                                                                          +22  -19
example/09_convnd_fwd/convnd_fwd_common.hpp

@@ -12,63 +12,73 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

 ck::tensor_operation::device::ConvParams
 parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
 {
-    ck::tensor_operation::device::ConvParams params;
+    const ck::index_t N = std::stoi(argv[arg_idx++]);
+    const ck::index_t K = std::stoi(argv[arg_idx++]);
+    const ck::index_t C = std::stoi(argv[arg_idx++]);

-    params.num_dim_spatial_ = num_dim_spatial;
-    params.N_               = std::stoi(argv[arg_idx++]);
-    params.K_               = std::stoi(argv[arg_idx++]);
-    params.C_               = std::stoi(argv[arg_idx++]);
+    std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
+    std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
+    std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
+    std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
+    std::vector<ck::index_t> input_left_pads(num_dim_spatial);
+    std::vector<ck::index_t> input_right_pads(num_dim_spatial);

-    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
+        filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
+        input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
+        conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
+        conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
+        input_left_pads[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
+        input_right_pads[i] = std::stoi(argv[arg_idx++]);
     }

-    return params;
+    return ck::tensor_operation::device::ConvParams{num_dim_spatial,
+                                                    N,
+                                                    K,
+                                                    C,
+                                                    filter_spatial_lengths,
+                                                    input_spatial_lengths,
+                                                    conv_filter_strides,
+                                                    conv_filter_dilations,
+                                                    input_left_pads,
+                                                    input_right_pads};
 }

 void print_helper_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: time kernel (0=n0, 1=yes)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
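As an aside, the argument order documented by print_helper_msg and consumed by parse_conv_params (which starts reading at argv[5]) corresponds to an invocation like the one below. The binary name and path are assumptions, and the values simply mirror the default ConvParams used elsewhere in this commit:

    # verification=1, init=integer, time_kernel=1, 2 spatial dims,
    # N=128 K=256 C=192, filter 3x3, input 71x71, strides 2 2,
    # dilations 1 1, left pads 1 1, right pads 1 1
    ./bin/example_convnd_fwd_xdl_fp16 1 1 1 2  128 256 192  3 3  71 71  2 2  1 1  1 1  1 1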
@@ -108,7 +118,7 @@ int run_conv_fwd_nhwc(const ck::tensor_operation::device::ConvParams& params,
     Tensor<InDataType> input(
         f_nhwc_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_));
-    Tensor<WeiDataType> weights(
+    Tensor<WeiDataType> weight(
         f_nhwc_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_));
     Tensor<OutDataType> host_output(
         f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
@@ -116,7 +126,7 @@ int run_conv_fwd_nhwc(const ck::tensor_operation::device::ConvParams& params,
         f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));

     std::cout << "input: " << input.mDesc << std::endl;
-    std::cout << "weights: " << weights.mDesc << std::endl;
+    std::cout << "weight: " << weight.mDesc << std::endl;
     std::cout << "output: " << host_output.mDesc << std::endl;

     switch(init_method)
@@ -124,19 +134,19 @@ int run_conv_fwd_nhwc(const ck::tensor_operation::device::ConvParams& params,
     case 0: break;
     case 1:
         input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
         break;
     default:
         input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
     }

     DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpace());
     DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());

     in_device_buf.ToDevice(input.mData.data());
-    wei_device_buf.ToDevice(weights.mData.data());
+    wei_device_buf.ToDevice(weight.mData.data());

     // do GEMM
     auto conv = DeviceConvNDFwdInstance{};
@@ -181,7 +191,7 @@ int run_conv_fwd_nhwc(const ck::tensor_operation::device::ConvParams& params,
         auto ref_invoker = ref_conv.MakeInvoker();

         auto ref_argument = ref_conv.MakeArgument(input,
-                                                  weights,
+                                                  weight,
                                                   host_output,
                                                   params.conv_filter_strides_,
                                                   params.conv_filter_dilations_,
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp

@@ -20,6 +20,7 @@ static constexpr auto ConvFwdDefault =
 template <ck::index_t NumDimSpatial>
 using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
+    NumDimSpatial,  // NumDimSpatial
     InDataType,     //
     WeiDataType,    //
     OutDataType,    //
@@ -28,7 +29,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
     WeiElementOp,   // Weights Elementwise Operation
     OutElementOp,   // Output Elementwise Operation
     ConvFwdDefault, // ConvForwardSpecialization
-    NumDimSpatial,  // NumDimSpatial
     256,            // BlockSize
     128,            // MPerBlock
     256,            // NPerBlock
@@ -77,7 +77,8 @@ int main(int argc, char* argv[])
     bool time_kernel = false;

     int num_dim_spatial = 2;

-    ck::tensor_operation::device::ConvParams params;
+    ck::tensor_operation::device::ConvParams params{
+        2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};

     if(argc == 1)
     {
@@ -96,7 +97,7 @@ int main(int argc, char* argv[])
         time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);

-        params = parse_conv_params(num_dim_spatial, argc, argv);
+        params = parse_conv_params(num_dim_spatial, 5, argv);
     }

     if(num_dim_spatial == 1)
include/ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp

@@ -39,7 +39,8 @@ namespace device {
 // 3D:
 // out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C]
 //
-template <typename InDataType,
+template <ck::index_t NumDimSpatial,
+          typename InDataType,
           typename WeiDataType,
           typename OutDataType,
           typename AccDataType,
@@ -47,7 +48,6 @@ template <typename InDataType,
           typename WeiElementwiseOperation,
           typename OutElementwiseOperation,
           ConvolutionForwardSpecialization ConvForwardSpecialization,
-          ck::index_t NumDimSpatial,
           ck::index_t BlockSize,
           ck::index_t MPerBlock,
           ck::index_t NPerBlock,
@@ -83,17 +83,38 @@ struct DeviceConvNdFwdNwcKxcNwk_Xdl
                        ck::Tuple<ck::tensor_layout::convolution::KXC,
                                  ck::tensor_layout::convolution::KYXC,
                                  ck::tensor_layout::convolution::KZYXC>>,
                        ck::tuple_element_t<NumDimSpatial - 1,
                                            ck::Tuple<ck::tensor_layout::convolution::NWK,
                                                      ck::tensor_layout::convolution::NHWK,
                                                      ck::tensor_layout::convolution::NDHWK>>,
                        InDataType,
                        WeiDataType,
                        OutDataType,
                        InElementwiseOperation,
                        WeiElementwiseOperation,
                        OutElementwiseOperation>
 {
+    using Base = DeviceConvFwd<NumDimSpatial,
+                               ck::tuple_element_t<NumDimSpatial - 1,
+                                                   ck::Tuple<ck::tensor_layout::convolution::NWC,
+                                                             ck::tensor_layout::convolution::NHWC,
+                                                             ck::tensor_layout::convolution::NDHWC>>,
+                               ck::tuple_element_t<NumDimSpatial - 1,
+                                                   ck::Tuple<ck::tensor_layout::convolution::KXC,
+                                                             ck::tensor_layout::convolution::KYXC,
+                                                             ck::tensor_layout::convolution::KZYXC>>,
+                               ck::tuple_element_t<NumDimSpatial - 1,
+                                                   ck::Tuple<ck::tensor_layout::convolution::NWK,
+                                                             ck::tensor_layout::convolution::NHWK,
+                                                             ck::tensor_layout::convolution::NDHWK>>,
+                               InDataType,
+                               WeiDataType,
+                               OutDataType,
+                               InElementwiseOperation,
+                               WeiElementwiseOperation,
+                               OutElementwiseOperation>;
+
     using DeviceOp = DeviceConvNdFwdNwcKxcNwk_Xdl;

     using ADataType = InDataType;
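The ck::tuple_element_t<NumDimSpatial - 1, ck::Tuple<...>> pattern used above selects, at compile time, the tensor layout that matches the spatial dimensionality. A minimal standalone sketch of the same idea using the standard library (the layout tag names here are illustrative placeholders, not taken from the diff):

    #include <tuple>
    #include <type_traits>

    struct NWC {};   // 1-D input layout
    struct NHWC {};  // 2-D input layout
    struct NDHWC {}; // 3-D input layout

    // Select the input layout for a given number of spatial dimensions (1, 2 or 3).
    template <int NumDimSpatial>
    using InLayout = std::tuple_element_t<NumDimSpatial - 1, std::tuple<NWC, NHWC, NDHWC>>;

    static_assert(std::is_same_v<InLayout<2>, NHWC>, "2-D convolution selects the NHWC layout");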
include/ck/utility/tuple.hpp

@@ -21,6 +21,8 @@ struct TupleElementKey
 template <typename Key, typename Data>
 struct TupleElementKeyData
 {
+    using DataType = Data;
+
 #if 0 // workaround compiler complaint about implicitly-deleted default constructor
     __host__ __device__ constexpr TupleElementKeyData() = default;
 #else
@@ -34,29 +36,40 @@ struct TupleElementKeyData
     {
     }

-    Data mData;
+    DataType mData;
 };

 // for read access of tuple element
 template <typename Key, typename Data>
-__host__ __device__ constexpr const Data& get_tuple_element_data(const TupleElementKeyData<Key, Data>& x)
+__host__ __device__ constexpr const Data&
+get_tuple_element_data_reference(const TupleElementKeyData<Key, Data>& x)
 {
     return static_cast<const Data&>(x.mData);
 }

 // for write access of tuple element
 template <typename Key, typename Data>
-__host__ __device__ constexpr Data& get_tuple_element_data(TupleElementKeyData<Key, Data>& x)
+__host__ __device__ constexpr Data& get_tuple_element_data_reference(TupleElementKeyData<Key, Data>& x)
 {
     return x.mData;
 }

 // TODO: not sure the use of reference is correct
 template <typename Key, typename Data>
-__host__ __device__ constexpr Data&& get_tuple_element_data(TupleElementKeyData<Key, Data>&& x)
+__host__ __device__ constexpr Data&& get_tuple_element_data_reference(TupleElementKeyData<Key, Data>&& x)
 {
     return static_cast<Data&&>(x.mData);
 }

+// for infering type of tuple element
+template <typename Key, typename Data>
+__host__ __device__ constexpr Data get_tuple_element_data(const TupleElementKeyData<Key, Data>& x)
+{
+    return std::forward(x.mData);
+}
+
 template <typename Indices, typename... Xs>
 struct TupleImpl;
@@ -87,13 +100,13 @@ struct TupleImpl<Sequence<Is...>, Xs...> : TupleElementKeyData<TupleElementKey<I
     template <index_t I>
     __host__ __device__ constexpr const auto& GetElementDataByKey(TupleElementKey<I>) const
     {
-        return get_tuple_element_data<TupleElementKey<I>>(*this);
+        return get_tuple_element_data_reference<TupleElementKey<I>>(*this);
     }

     template <index_t I>
     __host__ __device__ constexpr auto& GetElementDataByKey(TupleElementKey<I>)
     {
-        return get_tuple_element_data<TupleElementKey<I>>(*this);
+        return get_tuple_element_data_reference<TupleElementKey<I>>(*this);
     }
 };
@@ -185,7 +198,8 @@ struct Tuple<>
 template <index_t I, typename TTuple>
 struct tuple_element
 {
-    using type = decltype(TTuple{}.At(Number<I>{}));
+    // type should keep the cv/ref qualifier of original tuple element
+    using type = decltype(detail::get_tuple_element_data<detail::TupleElementKey<I>>(TTuple{}));
 };

 template <index_t I, typename TTuple>
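The reworked tuple_element trait infers the element type from the return type of an overload selected by the element key, rather than from At(), so the declared type (including any reference qualifier) is recovered exactly. A minimal standalone sketch of the same decltype trick (names and types here are illustrative, not the CK implementation):

    #include <type_traits>
    #include <utility>

    template <typename Key, typename Data>
    struct ElementKeyData
    {
        Data data;
    };

    // Declared-only overload used purely in unevaluated context: its return type
    // is the declared element type Data, exactly as stored.
    template <typename Key, typename Data>
    Data element_data(const ElementKeyData<Key, Data>&);

    struct K0 {};

    using Holder = ElementKeyData<K0, const int&>;

    // decltype of the (never executed) call recovers "const int&" unchanged.
    static_assert(std::is_same_v<decltype(element_data<K0>(std::declval<Holder>())), const int&>,
                  "element type is preserved");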
library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp

@@ -30,6 +30,8 @@ namespace host {
 // operation.
 // @tparam NumDimSpatial  Number of spatial dimensions.
 //
+// FIXME: only support NDimSpatial = 1 to 3; only support NCHW and NHWC layout.
+// Need to be more general
 template <ck::index_t NumDimSpatial,
           typename InLayout,
           typename WeiLayout,
library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp

@@ -10,7 +10,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

-// aliasing, for commonly used type
+// aliasing, for commonly used data type
 using F64 = double;
 using F32 = float;
 using F16 = ck::half_t;
@@ -23,9 +23,24 @@ using F16_F16_TUPLE = ck::Tuple<F16, F16>;
 using F32_TUPLE = ck::Tuple<F32>;

+// GEMM layout
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;

+// Conv layout
+using NWC   = ck::tensor_layout::convolution::NWC;
+using NHWC  = ck::tensor_layout::convolution::NHWC;
+using NDHWC = ck::tensor_layout::convolution::NDHWC;
+
+using KXC   = ck::tensor_layout::convolution::KXC;
+using KYXC  = ck::tensor_layout::convolution::KYXC;
+using KZYXC = ck::tensor_layout::convolution::KZYXC;
+
+using NWK   = ck::tensor_layout::convolution::NWK;
+using NHWK  = ck::tensor_layout::convolution::NHWK;
+using NDHWK = ck::tensor_layout::convolution::NDHWK;
+
+// pointwise functor
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using Scale       = ck::tensor_operation::element_wise::Scale;
 using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp

@@ -75,7 +75,7 @@ void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(
 void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(
     std::vector<std::unique_ptr<
-        DeviceConvFwd<2, NHWC, KYXC, NHWK, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        DeviceConvFwd<2, NHWC, KYXC, NHWK, F32, F32, F32, PassThrough, PassThrough, PassThrough>>>&
         instances);

 void add_device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
library/include/ck/library/utility/host_tensor.hpp

@@ -256,6 +256,14 @@ struct Tensor
         return *this;
     }

+    void SetZero()
+    {
+        for(auto& v : mData)
+        {
+            v = T{0};
+        }
+    }
+
     template <typename F>
     void ForEach_impl(F&& f, std::vector<size_t>& idx, size_t rank)
     {
library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt

@@ -5,8 +5,6 @@ set(DEVICE_CONV2D_FWD_INSTANCE_SOURCE
     device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp;
     device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
     device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp;
 )
-
-set(DEVICE_CONVND_2D_FWD_INSTANCE_SOURCE
-    device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp;
-    device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp;
-    device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp;
@@ -14,10 +12,5 @@ set(DEVICE_CONVND_2D_FWD_INSTANCE_SOURCE
-)

 add_library(device_conv2d_fwd_instance OBJECT ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE})
-add_library(device_convnd_2d_fwd_instance OBJECT ${DEVICE_CONVND_2D_FWD_INSTANCE_SOURCE})
 set_target_properties(device_conv2d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(device_convnd_2d_fwd_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 clang_tidy_check(device_conv2d_fwd_instance)
-clang_tidy_check(device_convnd_2d_fwd_instance)
library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
(diff collapsed, not shown)

library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
(diff collapsed, not shown)

library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
(diff collapsed, not shown)

library/src/tensor_operation_instance/gpu/conv2d_fwd/device_convnd_2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
(diff collapsed, not shown)
library/src/utility/convolution_parameter.cpp

@@ -45,17 +45,17 @@ ConvParams::ConvParams(ck::index_t n_dim,
     {
         // XEff = (X - 1) * conv_dilation_w + 1;
         // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-        const ck::index_t idx_eff =
+        const ck::index_t x_eff =
             (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;

         output_spatial_lengths_[i] =
-            (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - idx_eff) /
+            (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
                 conv_filter_strides_[i] +
             1;
     }
 }

 ConvParams::ConvParams()
-    : ConvParams::ConvParams(2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {2, 2}, {2, 2})
+    : ConvParams::ConvParams(2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1})
 {
 }
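As a quick sanity check of that output-size formula with the new default ConvParams (3x3 filter, 71x71 input, stride 2, dilation 1, left/right pads 1):

    // XEff = (3 - 1) * 1 + 1 = 3
    // Wo   = (71 + 1 + 1 - 3) / 2 + 1 = 70 / 2 + 1 = 36
    // so the default 2-D problem produces a 36 x 36 output per feature map.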
profiler/CMakeLists.txt

@@ -5,45 +5,45 @@ include_directories(BEFORE
 # ck_profiler
 set(PROFILER_SOURCE
     src/profiler.cpp
-    src/profile_gemm.cpp
-    src/profile_gemm_splitk.cpp
-    src/profile_gemm_bilinear.cpp
-    src/profile_gemm_bias_add_reduce.cpp
-    src/profile_gemm_add_add_fastgelu.cpp
-    src/profile_gemm_reduce.cpp
-    src/profile_batched_gemm.cpp
-    src/profile_batched_gemm_reduce.cpp
-    src/profile_grouped_gemm.cpp
-    src/profile_conv_fwd_bias_relu.cpp
-    src/profile_conv_fwd_bias_relu_add.cpp
-    src/profile_convnd_fwd.cpp
-    src/profile_convnd_bwd_data.cpp
-    src/profile_conv_bwd_weight.cpp
-    src/profile_convnd_bwd_weight.cpp
-    src/profile_reduce.cpp
-    src/profile_normalization.cpp
+    # src/profile_gemm.cpp
+    # src/profile_gemm_splitk.cpp
+    # src/profile_gemm_bilinear.cpp
+    # src/profile_gemm_bias_add_reduce.cpp
+    # src/profile_gemm_add_add_fastgelu.cpp
+    # src/profile_gemm_reduce.cpp
+    # src/profile_batched_gemm.cpp
+    # src/profile_batched_gemm_reduce.cpp
+    # src/profile_grouped_gemm.cpp
+    src/profile_conv_fwd.cpp
+    # src/profile_conv_fwd_bias_relu.cpp
+    # src/profile_conv_fwd_bias_relu_add.cpp
+    # src/profile_convnd_fwd.cpp
+    # src/profile_convnd_bwd_data.cpp
+    # src/profile_conv_bwd_weight.cpp
+    # src/profile_convnd_bwd_weight.cpp
+    # src/profile_reduce.cpp
+    # src/profile_normalization.cpp
 )

 add_executable(ckProfiler ${PROFILER_SOURCE})

 target_link_libraries(ckProfiler PRIVATE host_tensor)
 target_link_libraries(ckProfiler PRIVATE conv_util)
-target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
-target_link_libraries(ckProfiler PRIVATE device_gemm_splitk_instance)
-target_link_libraries(ckProfiler PRIVATE device_gemm_bilinear_instance)
-target_link_libraries(ckProfiler PRIVATE device_gemm_add_add_fastgelu_instance)
-target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
-target_link_libraries(ckProfiler PRIVATE device_gemm_bias_add_reduce_instance)
-target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
-target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance)
-target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv1d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE utility)
+#target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
+#target_link_libraries(ckProfiler PRIVATE device_gemm_splitk_instance)
+#target_link_libraries(ckProfiler PRIVATE device_gemm_bilinear_instance)
+#target_link_libraries(ckProfiler PRIVATE device_gemm_add_add_fastgelu_instance)
+#target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
+#target_link_libraries(ckProfiler PRIVATE device_gemm_bias_add_reduce_instance)
+#target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance)
+#target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance)
+#target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance)
+#target_link_libraries(ckProfiler PRIVATE device_conv1d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv3d_fwd_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
-target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
-target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_weight_instance)
-target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
-target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
+# target_link_libraries(ckProfiler PRIVATE device_conv3d_fwd_instance)
+# target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
+# target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
+# target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
+# target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
+# target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_weight_instance)
+# target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
+# target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
profiler/include/profile_conv_fwd_impl.hpp

@@ -15,14 +15,16 @@
 #include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

 namespace ck {
 namespace profiler {

+// FIXME: only support NCHW and NHWC layout, need to be more general
 template <ck::index_t NumDimSpatial,
           typename InLayout,
           typename WeiLayout,
@@ -34,70 +36,126 @@ int profile_conv_fwd_impl(int do_verification,
                           int init_method,
                           bool do_log,
                           bool time_kernel,
-                          const ck::utils::conv::ConvParams& params)
+                          const ck::tensor_operation::device::ConvParams& params)
 {
     bool pass = true;

-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
-            }
-            else
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
-            }
-        };
-
-    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
-    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    // make host tensor descritpor
+    auto f_nhwc_host_tensor_descriptor =
+        [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
+            std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
+                                                  static_cast<std::size_t>(c)};
+            nhwc_lengths.insert(
+                nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());
+
+            return HostTensorDescriptor(nhwc_lengths);
+        };
+
+    auto f_nchw_host_tensor_descriptor =
+        [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
+            std::vector<std::size_t> nchw_lengths{static_cast<std::size_t>(n),
+                                                  static_cast<std::size_t>(c)};
+            nchw_lengths.insert(
+                nchw_lengths.end(), spatial_lengths.begin(), spatial_lengths.end());
+
+            return HostTensorDescriptor(nchw_lengths);
+        };
+
+    HostTensorDescriptor in_desc, wei_desc, out_desc;
+
+    // FIXME: properly implement "make host descriptor" for different layout
+    if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NWC> ||
+                 is_same_v<InLayout, ck::tensor_layout::convolution::NHWC> ||
+                 is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
+    {
+        in_desc = f_nhwc_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_);
+    }
+    else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NCW> ||
+                      is_same_v<InLayout, ck::tensor_layout::convolution::NCHW> ||
+                      is_same_v<InLayout, ck::tensor_layout::convolution::NCDHW>)
+    {
+        in_desc = f_nchw_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_);
+    }
+
+    // FIXME: properly implement "make host descriptor" for different layout
+    if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC> ||
+                 is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC> ||
+                 is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
+    {
+        wei_desc = f_nhwc_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_);
+    }
+    else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KCX> ||
+                      is_same_v<WeiLayout, ck::tensor_layout::convolution::KCYX> ||
+                      is_same_v<WeiLayout, ck::tensor_layout::convolution::KCZYX>)
+    {
+        wei_desc = f_nchw_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_);
+    }
+
+    // FIXME: properly implement "make host descriptor" for different layout
+    if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NWK> ||
+                 is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK> ||
+                 is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
+    {
+        out_desc = f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths());
+    }
+    else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NKW> ||
+                      is_same_v<OutLayout, ck::tensor_layout::convolution::NKHW> ||
+                      is_same_v<OutLayout, ck::tensor_layout::convolution::NKDHW>)
+    {
+        out_desc = f_nchw_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths());
+    }
+
+    Tensor<InDataType> input(in_desc);
+    Tensor<WeiDataType> weight(wei_desc);
+    Tensor<OutDataType> host_output(out_desc);
+    Tensor<OutDataType> device_output(out_desc);

-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
+    std::cout << "input: " << input.mDesc << std::endl;
+    std::cout << "weight: " << weight.mDesc << std::endl;
+    std::cout << "output: " << host_output.mDesc << std::endl;

     switch(init_method)
     {
     case 0: break;
     case 1:
-        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
         break;
     default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
     }

-    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-    using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

-    const auto a_element_op = AElementOp{};
-    const auto b_element_op = BElementOp{};
-    const auto c_element_op = CElementOp{};
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};

-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());

-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    c_device_buf.ToDevice(c_m_n_device_result.mData.data());
+    in_device_buf.ToDevice(input.mData.data());
+    wei_device_buf.ToDevice(weight.mData.data());

-    using DeviceOp = ck::tensor_operation::device::DeviceGemm<ALayout,
-                                                              BLayout,
-                                                              CLayout,
-                                                              ADataType,
-                                                              BDataType,
-                                                              CDataType,
-                                                              AElementOp,
-                                                              BElementOp,
-                                                              CElementOp>;
+    using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NumDimSpatial,
+                                                                 InLayout,
+                                                                 WeiLayout,
+                                                                 OutLayout,
+                                                                 InDataType,
+                                                                 WeiDataType,
+                                                                 OutDataType,
+                                                                 InElementOp,
+                                                                 WeiElementOp,
+                                                                 OutElementOp>;

     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
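To illustrate what the two descriptor helpers above produce, using the default ConvParams values from this commit (N=128, C=192, spatial lengths {71, 71}); the helper names are from the diff, the concrete numbers are only an example:

    // f_nhwc_host_tensor_descriptor(128, 192, {71, 71}) -> lengths {128, 71, 71, 192}
    //   (spatial lengths are inserted after N, channels stay last)
    // f_nchw_host_tensor_descriptor(128, 192, {71, 71}) -> lengths {128, 192, 71, 71}
    //   (spatial lengths are appended after N and C)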
@@ -105,94 +163,107 @@ int profile_conv_fwd_impl(int do_verification,
     std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

-    // Run reference GEMM
+    // run reference op
     if(do_verification)
     {
-        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
-                                                                                BDataType,
-                                                                                CDataType,
-                                                                                AccDataType,
-                                                                                AElementOp,
-                                                                                BElementOp,
-                                                                                CElementOp>;
-
-        auto ref_op      = ReferenceGemmInstance{};
-        auto ref_invoker = ref_op.MakeInvoker();
-
-        auto ref_argument = ref_op.MakeArgument(
-            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NumDimSpatial,
+                                                                     InLayout,
+                                                                     WeiLayout,
+                                                                     OutLayout,
+                                                                     InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp>{};
+
+        auto ref_invoker  = ref_conv.MakeInvoker();
+        auto ref_argument = ref_conv.MakeArgument(input,
+                                                  weight,
+                                                  host_output,
+                                                  params.conv_filter_strides_,
+                                                  params.conv_filter_dilations_,
+                                                  params.input_left_pads_,
+                                                  params.input_right_pads_,
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op);
+
+        // init host output to zero
+        host_output.SetZero();

         ref_invoker.Run(ref_argument);
     }

     std::string best_op_name;
-    float best_ave_time   = 0;
+    float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;

-    // profile device GEMM instances
+    // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
         auto argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-                                        M,
-                                        N,
-                                        K,
-                                        StrideA,
-                                        StrideB,
-                                        StrideC,
-                                        a_element_op,
-                                        b_element_op,
-                                        c_element_op);
+            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                        params.N_,
+                                        params.K_,
+                                        params.C_,
+                                        params.input_spatial_lengths_,
+                                        params.filter_spatial_lengths_,
+                                        params.GetOutputSpatialLengths(),
+                                        params.conv_filter_strides_,
+                                        params.conv_filter_dilations_,
+                                        params.input_left_pads_,
+                                        params.input_right_pads_,
+                                        in_element_op,
+                                        wei_element_op,
+                                        out_element_op);

         auto invoker_ptr = op_ptr->MakeInvokerPointer();

         if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            // re-init C to zero before profiling next kernel
-            c_device_buf.SetZero();
+            // re-init output to zero before profiling next kernel
+            out_device_buf.SetZero();

             std::string op_name = op_ptr->GetTypeString();

-            float ave_time =
+            float avg_time =
                 invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-            std::size_t flop = std::size_t(2) * M * N * K;
-            std::size_t num_btype =
-                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+            std::size_t flop      = params.GetFlops();
+            std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();

-            float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_btype / 1.E6 / avg_time;

-            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                       << gb_per_sec << " GB/s, " << op_name << std::endl;

             if(tflops > best_tflops)
             {
                 best_op_name    = op_name;
                 best_tflops     = tflops;
-                best_ave_time   = ave_time;
+                best_avg_time   = avg_time;
                 best_gb_per_sec = gb_per_sec;
             }

             if(do_verification)
             {
-                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+                out_device_buf.FromDevice(device_output.mData.data());

-                pass = pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+                pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);

                 if(do_log)
                 {
-                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "host_output  : ", host_output.mData, ",")
                         << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+                    LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
                         << std::endl;
                 }
             }
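A note on units in the perf computation above (the arithmetic predates this commit; it is spelled out here only for clarity). The invoker reports kernel time in milliseconds, as the "ms" in the printed line indicates, so:

    // flop  / 1.E9 / time_ms  = flop  / (1e9  * 1e-3 s) = flop  / (1e12 s)  -> TFLOPS
    // bytes / 1.E6 / time_ms  = bytes / (1e6  * 1e-3 s) = bytes / (1e9  s)  -> GB/s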
@@ -203,47 +274,11 @@ int profile_conv_fwd_impl(int do_verification,
         }
     }

-    if constexpr(is_same<CDataType, float>::value)
-    {
-        std::cout << "Best Perf for datatype = f32";
-    }
-    else if constexpr(is_same<CDataType, half_t>::value)
-    {
-        std::cout << "Best Perf for datatype = f16";
-    }
-    else if constexpr(is_same<CDataType, bhalf_t>::value)
-    {
-        std::cout << "Best Perf for datatype = bf16";
-    }
-    else if constexpr(is_same<CDataType, int8_t>::value)
-    {
-        std::cout << "Best Perf for datatype = int8";
-    }
-
-    if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
-    {
-        std::cout << " ALayout = RowMajor";
-    }
-    else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value)
-    {
-        std::cout << " ALayout = ColumnMajor";
-    }
-
-    if constexpr(is_same<BLayout, tensor_layout::gemm::RowMajor>::value)
-    {
-        std::cout << " BLayout = RowMajor";
-    }
-    else if constexpr(is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value)
-    {
-        std::cout << " BLayout = ColumnMajor";
-    }
-
-    std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
-              << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time
-              << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
-              << best_op_name << std::endl;
+    std::cout << "Best configuration parameters:"
+              << "\nname: " << best_op_name
+              << "\navg_time: " << best_avg_time
+              << "\ntflops: " << best_tflops
+              << "\nGB/s: " << best_gb_per_sec << std::endl;

-    return pass ? 0 : 1;
+    return 0;
 }

 } // namespace profiler
profiler/include/profile_gemm_impl.hpp

@@ -23,13 +23,13 @@
 namespace ck {
 namespace profiler {

-template <typename ADataType,
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
           typename BDataType,
           typename AccDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout>
+          typename CDataType>
 int profile_gemm_impl(int do_verification,
                       int init_method,
                       bool do_log,
@@ -92,7 +92,6 @@ int profile_gemm_impl(int do_verification,
     a_device_buf.ToDevice(a_m_k.mData.data());
     b_device_buf.ToDevice(b_k_n.mData.data());
-    c_device_buf.ToDevice(c_m_n_device_result.mData.data());

     using DeviceOp = ck::tensor_operation::device::DeviceGemm<ALayout,
                                                               BLayout,
@@ -110,7 +109,7 @@ int profile_gemm_impl(int do_verification,
     std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

-    // Run reference GEMM
+    // Run reference op
     if(do_verification)
     {
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
@@ -131,11 +130,11 @@ int profile_gemm_impl(int do_verification,
     }

     std::string best_op_name;
-    float best_ave_time   = 0;
+    float best_avg_time   = 0;
     float best_tflops     = 0;
     float best_gb_per_sec = 0;

-    // profile device GEMM instances
+    // profile device op instances
     for(auto& op_ptr : op_ptrs)
     {
         auto argument_ptr =
@@ -161,7 +160,7 @@ int profile_gemm_impl(int do_verification,
             std::string op_name = op_ptr->GetTypeString();

-            float ave_time =
+            float avg_time =
                 invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

             std::size_t flop = std::size_t(2) * M * N * K;
@@ -169,18 +168,18 @@ int profile_gemm_impl(int do_verification,
             std::size_t num_btype =
                 sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;

-            float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
-            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+            float gb_per_sec = num_btype / 1.E6 / avg_time;

-            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                       << gb_per_sec << " GB/s, " << op_name << std::endl;

             if(tflops > best_tflops)
             {
                 best_op_name    = op_name;
                 best_tflops     = tflops;
-                best_ave_time   = ave_time;
+                best_avg_time   = avg_time;
                 best_gb_per_sec = gb_per_sec;
             }
@@ -244,7 +243,7 @@ int profile_gemm_impl(int do_verification,
     }

     std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
-              << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time
+              << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_avg_time
               << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
               << best_op_name << std::endl;
profiler/src/profile_conv_fwd.cpp  (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>

#include "profiler/include/profile_conv_fwd_impl.hpp"

enum struct ConvLayout
{
    NCHW_KYXC_NKHW, // 0
    NHWC_KYXC_NHWK, // 1
};

enum struct ConvDataType
{
    F32_F32_F32,    // 0
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
};

static void print_helper_msg()
{
    // clang-format-off
    std::cout
        << "arg1: tensor operation (conv_fwd: ForwardConvolution)\n"
        << "arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n"
        << "arg3: tensor layout (0: Input[N, C, Hi, Wi] * Weight[K, C, Y, X] = Output[N, K, Ho, Wo]\n"
        << "                     1: Input[N, Hi, Wi, C] * Weight[K, Y, X, C] = Output[N, Ho, Wo, K])\n"
        << "arg4: verification (0: no, 1: yes)\n"
        << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
        << "arg6: print tensor value (0: no; 1: yes)\n"
        << "arg7: time kernel (0: no, 1: yes)\n"
        << "arg8: N spatial dimensions\n"
        << "Following arguments (depending on number of spatial dims):\n"
        << " N, K, C, \n"
        << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
        << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
        << " <strides>, (ie Sy, Sx for 2D)\n"
        << " <dilations>, (ie Dy, Dx for 2D)\n"
        << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
        << " <right padding>, (ie RightPy, RightPx for 2D)\n"
        << std::endl;
    // clang-format-on
}

ck::tensor_operation::device::ConvParams
parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
{
    const ck::index_t N = std::stoi(argv[arg_idx++]);
    const ck::index_t K = std::stoi(argv[arg_idx++]);
    const ck::index_t C = std::stoi(argv[arg_idx++]);

    std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
    std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
    std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
    std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
    std::vector<ck::index_t> input_left_pads(num_dim_spatial);
    std::vector<ck::index_t> input_right_pads(num_dim_spatial);

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        input_left_pads[i] = std::stoi(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
        input_right_pads[i] = std::stoi(argv[arg_idx++]);
    }

    return ck::tensor_operation::device::ConvParams{num_dim_spatial,
                                                    N,
                                                    K,
                                                    C,
                                                    filter_spatial_lengths,
                                                    input_spatial_lengths,
                                                    conv_filter_strides,
                                                    conv_filter_dilations,
                                                    input_left_pads,
                                                    input_right_pads};
}

int profile_conv_fwd(int argc, char* argv[])
{
    // 8 for control, 1 for num_dim_spatial
    if(argc < 9)
    {
        print_helper_msg();
        exit(1);
    }

    const auto data_type       = static_cast<ConvDataType>(std::stoi(argv[2]));
    const auto layout          = static_cast<ConvLayout>(std::stoi(argv[3]));
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const bool time_kernel     = std::stoi(argv[7]);
    const int num_dim_spatial  = std::stoi(argv[8]);

    // 8 for control, 1 for num_dim_spatial, 3 for N/K/C, and 6 * num_dim_spatial
    if(argc != 8 + 4 + 6 * num_dim_spatial)
    {
        print_helper_msg();
        exit(1);
    }

    const auto params = parse_conv_params(num_dim_spatial, 9, argv);

    using F32  = float;
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
    using INT8 = int8_t;

    using NWC   = ck::tensor_layout::convolution::NWC;
    using NHWC  = ck::tensor_layout::convolution::NHWC;
    using NDHWC = ck::tensor_layout::convolution::NDHWC;

    using KXC   = ck::tensor_layout::convolution::KXC;
    using KYXC  = ck::tensor_layout::convolution::KYXC;
    using KZYXC = ck::tensor_layout::convolution::KZYXC;

    using NWK   = ck::tensor_layout::convolution::NWK;
    using NHWK  = ck::tensor_layout::convolution::NHWK;
    using NDHWK = ck::tensor_layout::convolution::NDHWK;

    constexpr auto I1 = ck::Number<1>{};
    constexpr auto I2 = ck::Number<2>{};
    constexpr auto I3 = ck::Number<3>{};

    auto profile = [&](auto num_dim_spatial_tmp,
                       auto in_type,
                       auto wei_type,
                       auto out_type,
                       auto in_layout,
                       auto wei_layout,
                       auto out_layout) {
        constexpr ck::index_t NumDimSpatial = num_dim_spatial_tmp.value;

        using InDataType  = decltype(in_type);
        using WeiDataType = decltype(wei_type);
        using OutDataType = decltype(out_type);

        using InLayout  = decltype(in_layout);
        using WeiLayout = decltype(wei_layout);
        using OutLayout = decltype(out_layout);

        bool pass = ck::profiler::profile_conv_fwd_impl<NumDimSpatial,
                                                        InDataType,
                                                        WeiDataType,
                                                        OutDataType,
                                                        InLayout,
                                                        WeiLayout,
                                                        OutLayout>(
            do_verification, init_method, do_log, time_kernel, params);

        return pass ? 0 : 1;
    };

    if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
            return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_BF16_BF16)
        {
            return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, BF16{}, BF16{});
        }
        else if(data_type == ConvDataType::INT8_INT8_INT8)
        {
            return profile(I1, NWC{}, KXC{}, NWK{}, INT8{}, INT8{}, INT8{});
        }
    }
    else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
            return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_BF16_BF16)
        {
            return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, BF16{}, BF16{});
        }
        else if(data_type == ConvDataType::INT8_INT8_INT8)
        {
            return profile(I2, NHWC{}, KYXC{}, NHWK{}, INT8{}, INT8{}, INT8{});
        }
    }
    else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_BF16_BF16)
        {
            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, BF16{}, BF16{});
        }
        else if(data_type == ConvDataType::INT8_INT8_INT8)
        {
            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, INT8{}, INT8{}, INT8{});
        }
    }

    std::cout << "this data_type & layout is not implemented" << std::endl;

    return 1;
}
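A hypothetical ckProfiler invocation matching this argument layout (the executable name comes from profiler/CMakeLists.txt above; the problem sizes mirror the default ConvParams and are only an example, not taken from the commit):

    # conv_fwd, fp16, NHWC layout, verify, integer init, no tensor dump, time kernel, 2 spatial dims,
    # N=128 K=256 C=192, filter 3x3, input 71x71, strides 2 2, dilations 1 1, left pads 1 1, right pads 1 1
    ./ckProfiler conv_fwd 1 1 1 1 0 1 2  128 256 192  3 3  71 71  2 2  1 1  1 1  1 1

This is 23 arguments after the program name, which satisfies the argc == 8 + 4 + 6 * num_dim_spatial check for num_dim_spatial == 2.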
profiler/src/profile_gemm.cpp

@@ -24,21 +24,27 @@ enum struct GemmDataType
     INT8_INT8_INT8, // 3
 };

+static void print_helper_msg()
+{
+    std::cout << "arg1: tensor operation (gemm: GEMM)\n"
+              << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
+              << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
+              << "                     1: A[m, k] * B[n, k] = C[m, n];\n"
+              << "                     2: A[k, m] * B[k, n] = C[m, n];\n"
+              << "                     3: A[k, m] * B[n, k] = C[m, n])\n"
+              << "arg4: verification (0: no; 1: yes)\n"
+              << "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"
+              << "arg6: print tensor value (0: no; 1: yes)\n"
+              << "arg7: time kernel (0: no, 1: yes)\n"
+              << "arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"
+              << std::endl;
+}
+
 int profile_gemm(int argc, char* argv[])
 {
     if(argc != 14)
     {
-        printf("arg1: tensor operation (gemm: GEMM)\n");
-        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
-        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
-        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
-        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
-        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
-        printf("arg4: verification (0: no; 1: yes)\n");
-        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-        printf("arg6: print tensor value (0: no; 1: yes)\n");
-        printf("arg7: time kernel (0=no, 1=yes)\n");
-        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
+        print_helper_msg();

         exit(1);
     }
@@ -109,67 +115,67 @@ int profile_gemm(int argc, char* argv[])
     if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Row{}, Row{});
+        return profile(Row{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{});
     }
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Col{}, Row{});
+        return profile(Row{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{});
     }
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
     {
-        return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Row{}, Row{});
+        return profile(Col{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{});
     }
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
     {
-        return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Col{}, Row{});
+        return profile(Col{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{});
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{});
+        return profile(Row{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{});
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+        return profile(Row{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{});
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
     {
-        return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{});
+        return profile(Col{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{});
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
     {
-        return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{});
+        return profile(Col{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{});
     }
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
+        return profile(Row{}, Row{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
     }
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
+        return profile(Row{}, Col{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
     }
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN)
     {
-        return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
+        return profile(Col{}, Row{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
     }
     else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN)
     {
-        return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Col{}, Row{});
+        return profile(Col{}, Col{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
     }
     else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        return profile(INT8{}, INT8{}, INT32{}, INT8{}, Row{}, Row{}, Row{});
+        return profile(Row{}, Row{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
     }
     else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        return profile(INT8{}, INT8{}, INT32{}, INT8{}, Row{}, Col{}, Row{});
+        return profile(Row{}, Col{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
     }
     else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_KN_MN)
     {
-        return profile(INT8{}, INT8{}, INT32{}, INT8{}, Col{}, Row{}, Row{});
+        return profile(Col{}, Row{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
     }
     else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_NK_MN)
     {
-        return profile(INT8{}, INT8{}, INT32{}, INT8{}, Col{}, Col{}, Row{});
+        return profile(Col{}, Col{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
     }
     else
     {
profiler/src/profiler.cpp

@@ -3,24 +3,23 @@
 #include <cstring>

-int profile_gemm(int, char*[]);
-int profile_gemm_splitk(int, char*[]);
-int profile_gemm_bilinear(int, char*[]);
-int profile_gemm_add_add_fastgelu(int, char*[]);
-int profile_gemm_reduce(int, char*[]);
-int profile_gemm_bias_add_reduce(int, char*[]);
-int profile_batched_gemm(int, char*[]);
-int profile_batched_gemm_reduce(int, char*[]);
-int profile_grouped_gemm(int, char*[]);
+// int profile_gemm(int, char*[]);
+// int profile_gemm_splitk(int, char*[]);
+// int profile_gemm_bilinear(int, char*[]);
+// int profile_gemm_add_add_fastgelu(int, char*[]);
+// int profile_gemm_reduce(int, char*[]);
+// int profile_gemm_bias_add_reduce(int, char*[]);
+// int profile_batched_gemm(int, char*[]);
+// int profile_batched_gemm_reduce(int, char*[]);
+// int profile_grouped_gemm(int, char*[]);
+int profile_conv_fwd(int, char*[]);
-int profile_conv_fwd_bias_relu(int, char*[]);
-int profile_conv_fwd_bias_relu_add(int, char*[]);
-int profile_convnd_fwd(int argc, char* argv[]);
-int profile_convnd_bwd_data(int, char*[], int);
-int profile_conv_bwd_weight(int, char*[]);
-int profile_normalization(int, char*[]);
-int profile_reduce(int, char*[]);
-int profile_convnd_bwd_weight(int, char*[], int);
+// int profile_conv_fwd_bias_relu(int, char*[]);
+// int profile_conv_fwd_bias_relu_add(int, char*[]);
+// int profile_convnd_bwd_data(int, char*[], int);
+// int profile_conv_bwd_weight(int, char*[]);
+// int profile_normalization(int, char*[]);
+// int profile_reduce(int, char*[]);
+// int profile_convnd_bwd_weight(int, char*[], int);

 static void print_helper_message()
 {
@@ -54,6 +53,7 @@ int main(int argc, char* argv[])
         return 0;
     }

+#if 0
     if(strcmp(argv[1], "gemm") == 0)
     {
         return profile_gemm(argc, argv);
@@ -90,10 +90,12 @@ int main(int argc, char* argv[])
     {
         return profile_grouped_gemm(argc, argv);
     }
-    else if(strcmp(argv[1], "conv_fwd") == 0)
+#endif
+    if(strcmp(argv[1], "conv_fwd") == 0)
     {
-        return profile_convnd_fwd(argc, argv);
+        return profile_conv_fwd(argc, argv);
     }
+#if 0
     else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0)
     {
         return profile_conv_fwd_bias_relu(argc, argv);
@@ -139,6 +141,7 @@ int main(int argc, char* argv[])
     {
         return profile_normalization(argc, argv);
     }
+#endif
     else
     {
         print_helper_message();