Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b2290854
Commit
b2290854
authored
May 27, 2022
by
rocking
Browse files
Merge commit '
3e6c2610
' into gemm_norm
parents
253f7ef2
3e6c2610
Changes
201
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
280 additions
and
405 deletions
+280
-405
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp
...e_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp
+0
-24
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp
...ce_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp
+0
-40
profiler/CMakeLists.txt
profiler/CMakeLists.txt
+1
-0
profiler/include/profile_gemm_impl.hpp
profiler/include/profile_gemm_impl.hpp
+117
-18
profiler/include/profile_gemm_reduce_impl.hpp
profiler/include/profile_gemm_reduce_impl.hpp
+7
-2
profiler/include/profile_grouped_gemm_impl.hpp
profiler/include/profile_grouped_gemm_impl.hpp
+2
-0
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+120
-308
profiler/src/profile_batched_gemm.cpp
profiler/src/profile_batched_gemm.cpp
+1
-1
profiler/src/profile_batched_gemm_reduce.cpp
profiler/src/profile_batched_gemm_reduce.cpp
+1
-1
profiler/src/profile_conv_bwd_weight.cpp
profiler/src/profile_conv_bwd_weight.cpp
+1
-1
profiler/src/profile_conv_fwd_bias_relu.cpp
profiler/src/profile_conv_fwd_bias_relu.cpp
+1
-1
profiler/src/profile_conv_fwd_bias_relu_add.cpp
profiler/src/profile_conv_fwd_bias_relu_add.cpp
+1
-1
profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
+1
-1
profiler/src/profile_convnd_fwd.cpp
profiler/src/profile_convnd_fwd.cpp
+1
-1
profiler/src/profile_gemm.cpp
profiler/src/profile_gemm.cpp
+17
-1
profiler/src/profile_gemm_bias_2d.cpp
profiler/src/profile_gemm_bias_2d.cpp
+1
-1
profiler/src/profile_gemm_bias_relu.cpp
profiler/src/profile_gemm_bias_relu.cpp
+1
-1
profiler/src/profile_gemm_bias_relu_add.cpp
profiler/src/profile_gemm_bias_relu_add.cpp
+1
-1
profiler/src/profile_gemm_reduce.cpp
profiler/src/profile_gemm_reduce.cpp
+1
-1
profiler/src/profile_grouped_gemm.cpp
profiler/src/profile_grouped_gemm.cpp
+5
-1
No files found.
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp
deleted
100644 → 0
View file @
253f7ef2
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
3
);
// for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
0
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
3
);
// for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int32_t
,
int8_t
,
5
,
0
,
0
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp
deleted
100644 → 0
View file @
253f7ef2
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
0
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
3
);
// for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
2
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
3
);
// for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
3
,
0
,
1
,
2
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
3
);
// for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
4
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
4
,
1
);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID
(
int8_t
,
int8_t
,
int8_t
,
4
,
0
,
1
,
2
,
1
);
// clang-format on
}
// namespace device_reduce_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
profiler/CMakeLists.txt
View file @
b2290854
include_directories
(
BEFORE
${
PROJECT_SOURCE_DIR
}
/include/ck
${
PROJECT_SOURCE_DIR
}
/include/ck/utility
${
PROJECT_SOURCE_DIR
}
/include/ck/host_utility
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_description
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor
${
PROJECT_SOURCE_DIR
}
/include/ck/problem_transform
...
...
profiler/include/profile_gemm_impl.hpp
View file @
b2290854
#pragma once
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include "check_err.hpp"
#include "config.hpp"
...
...
@@ -42,14 +44,10 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(std::vector<De
void
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
...
...
@@ -74,6 +72,21 @@ void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(std::vector<Devic
void
add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
}
// namespace device_gemm_instance
}
// namespace device
}
// namespace tensor_operation
...
...
@@ -85,6 +98,7 @@ namespace profiler {
template
<
typename
ADataType
,
typename
BDataType
,
typename
CDataType
,
typename
AccDataType
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
>
...
...
@@ -125,7 +139,11 @@ void profile_gemm_impl(int do_verification,
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
{
case
0
:
break
;
// case 0: break;
case
0
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_1
<
ADataType
>
{},
num_thread
);
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_1
<
BDataType
>
{},
num_thread
);
break
;
case
1
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
5
,
5
},
num_thread
);
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
},
num_thread
);
...
...
@@ -174,6 +192,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -192,6 +213,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -210,6 +234,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -228,6 +255,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -250,6 +280,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -268,6 +301,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances
(
gemm_ptrs
);
...
...
@@ -289,6 +325,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -307,6 +346,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -353,28 +395,40 @@ void profile_gemm_impl(int do_verification,
is_same
<
CLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances
(
gemm_ptrs
);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances
(
gemm_ptrs
);
}
else
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
&&
is_same
<
BLayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
&&
is_same
<
CLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances
(
gemm_ptrs
);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances
(
gemm_ptrs
);
}
else
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
&&
is_same
<
BLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
&&
is_same
<
CLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances
(
gemm_ptrs
);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances
(
gemm_ptrs
);
}
else
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
&&
is_same
<
BLayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
&&
is_same
<
CLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances
(
gemm_ptrs
);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances
(
gemm_ptrs
);
}
}
...
...
@@ -458,8 +512,14 @@ void profile_gemm_impl(int do_verification,
bf16_to_f32_
(
b_k_n
,
b_f32_k_n
);
bf16_to_f32_
(
c_m_n_device_result
,
c_m_n_device_f32_result
);
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
float
,
float
,
float
,
AElementOp
,
BElementOp
,
CElementOp
>
;
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
float
,
float
,
float
,
float
,
AElementOp
,
BElementOp
,
CElementOp
>
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
...
...
@@ -491,6 +551,7 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
...
...
@@ -523,12 +584,50 @@ void profile_gemm_impl(int do_verification,
}
else
{
std
::
cout
<<
"does not support this GEMM problem"
<<
std
::
endl
;
std
::
cout
<<
gemm_ptr
->
GetTypeString
()
<<
" does not support this GEMM problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_gemm_name
<<
std
::
endl
;
if
constexpr
(
is_same
<
CDataType
,
float
>::
value
)
{
std
::
cout
<<
"Best Perf for datatype = f32"
;
}
else
if
constexpr
(
is_same
<
CDataType
,
half_t
>::
value
)
{
std
::
cout
<<
"Best Perf for datatype = f16"
;
}
else
if
constexpr
(
is_same
<
CDataType
,
bhalf_t
>::
value
)
{
std
::
cout
<<
"Best Perf for datatype = bf16"
;
}
else
if
constexpr
(
is_same
<
CDataType
,
int8_t
>::
value
)
{
std
::
cout
<<
"Best Perf for datatype = int8"
;
}
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
std
::
cout
<<
" ALayout = RowMajor"
;
}
else
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
)
{
std
::
cout
<<
" ALayout = ColumnMajor"
;
}
if
constexpr
(
is_same
<
BLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
std
::
cout
<<
" BLayout = RowMajor"
;
}
else
if
constexpr
(
is_same
<
BLayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
)
{
std
::
cout
<<
" BLayout = ColumnMajor"
;
}
std
::
cout
<<
" M = "
<<
M
<<
" N = "
<<
N
<<
" K = "
<<
K
<<
" StrideA = "
<<
StrideA
<<
" StrideB = "
<<
StrideB
<<
" StrideC = "
<<
StrideC
<<
" : "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_gemm_name
<<
std
::
endl
;
}
}
// namespace profiler
...
...
profiler/include/profile_gemm_reduce_impl.hpp
View file @
b2290854
...
...
@@ -146,8 +146,13 @@ bool profile_gemm_reduce_impl(int do_verification,
if
(
do_verification
)
{
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
DDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
...
...
profiler/include/profile_grouped_gemm_impl.hpp
View file @
b2290854
...
...
@@ -43,6 +43,7 @@ namespace profiler {
template
<
typename
ADataType
,
typename
BDataType
,
typename
CDataType
,
typename
AccDataType
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
>
...
...
@@ -271,6 +272,7 @@ void profile_grouped_gemm_impl(int do_verification,
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
...
...
profiler/include/profile_reduce_impl.hpp
View file @
b2290854
...
...
@@ -5,74 +5,77 @@
#include "device_reduce_instance.hpp"
#include "reduction_enums.hpp"
#include "host_reduction.hpp"
#include "host_common_util.hpp"
#include "host_tensor_generator.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
template
<
int
Rank
,
int
NumReduceDim
,
int
ReduceOpId
,
int
NanOpt
,
int
IndicesOpt
>
template
<
int
Rank
,
int
NumReduceDim
,
int
ReduceOpId
,
bool
PropagateNan
,
bool
UseIndex
>
struct
ReduceDescription
{
static
constexpr
int
Rank_
=
Rank
;
static
constexpr
int
NumReduceDim_
=
NumReduceDim
;
static
constexpr
int
ReduceOpId_
=
ReduceOpId
;
static
constexpr
int
NanOpt_
=
NanOpt
;
static
constexpr
int
IndicesOpt_
=
IndicesOpt
;
static
constexpr
int
PropagateNan_
=
PropagateNan
;
static
constexpr
int
UseIndex_
=
UseIndex
;
};
using
reduce_description_instances
=
std
::
tuple
<
ReduceDescription
<
4
,
3
,
0
,
0
,
0
>
,
// for ADD
ReduceDescription
<
4
,
4
,
0
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
0
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
0
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
5
,
0
,
0
>
,
// for AVG
ReduceDescription
<
4
,
4
,
5
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
5
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
5
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
7
,
0
,
0
>
,
// for NORM2
ReduceDescription
<
4
,
4
,
7
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
7
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
7
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
2
,
0
,
0
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
2
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
2
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
3
,
0
,
0
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
3
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
3
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
4
,
0
,
0
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
4
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
4
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
2
,
0
,
1
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
0
,
1
>
,
ReduceDescription
<
4
,
1
,
2
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
2
,
0
,
1
>
,
ReduceDescription
<
4
,
3
,
3
,
0
,
1
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
0
,
1
>
,
ReduceDescription
<
4
,
1
,
3
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
3
,
0
,
1
>
,
ReduceDescription
<
4
,
3
,
4
,
0
,
1
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
0
,
1
>
,
ReduceDescription
<
4
,
1
,
4
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
4
,
0
,
1
>>
;
using
reduce_description_instances
=
std
::
tuple
<
ReduceDescription
<
4
,
3
,
0
,
false
,
false
>
,
// for ADD
ReduceDescription
<
4
,
4
,
0
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
0
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
0
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
5
,
false
,
false
>
,
// for AVG
ReduceDescription
<
4
,
4
,
5
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
5
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
5
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
7
,
false
,
false
>
,
// for NORM2
ReduceDescription
<
4
,
4
,
7
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
7
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
7
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
2
,
false
,
false
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
2
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
2
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
3
,
false
,
false
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
3
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
3
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
4
,
false
,
false
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
4
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
4
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
2
,
false
,
true
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
false
,
true
>
,
ReduceDescription
<
4
,
1
,
2
,
false
,
true
>
,
ReduceDescription
<
2
,
1
,
2
,
false
,
true
>
,
ReduceDescription
<
4
,
3
,
3
,
false
,
true
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
false
,
true
>
,
ReduceDescription
<
4
,
1
,
3
,
false
,
true
>
,
ReduceDescription
<
2
,
1
,
3
,
false
,
true
>
,
ReduceDescription
<
4
,
3
,
4
,
false
,
true
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
false
,
true
>
,
ReduceDescription
<
4
,
1
,
4
,
false
,
true
>
,
ReduceDescription
<
2
,
1
,
4
,
false
,
true
>>
;
template
<
typename
DescriptionType
>
bool
description_match
(
const
DescriptionType
&
description
,
int
Rank
,
const
std
::
vector
<
int
>&
reduceDims
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
ReduceTensorIndices
IndicesOpt
)
bool
Propagat
eNan
,
bool
UseIndex
)
{
if
(
description
.
Rank_
!=
Rank
||
description
.
ReduceOpId_
!=
static_cast
<
int
>
(
ReduceOpId
)
||
description
.
Nan
Opt
_
!=
static_cast
<
int
>
(
Nan
Opt
)
||
description
.
IndicesOpt
_
!=
static_cast
<
int
>
(
IndicesOpt
))
description
.
Propagate
Nan_
!=
static_cast
<
int
>
(
Propagate
Nan
)
||
description
.
UseIndex
_
!=
static_cast
<
int
>
(
UseIndex
))
return
(
false
);
if
(
DescriptionType
::
NumReduceDim_
!=
reduceDims
.
size
())
...
...
@@ -116,46 +119,16 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
return
invariantDims
;
};
template
<
typename
T
>
static
void
dumpBufferToFile
(
const
char
*
fileName
,
T
*
data
,
size_t
dataNumItems
)
{
std
::
ofstream
outFile
(
fileName
,
std
::
ios
::
binary
);
if
(
outFile
)
{
outFile
.
write
(
reinterpret_cast
<
char
*>
(
data
),
dataNumItems
*
sizeof
(
T
));
outFile
.
close
();
std
::
cout
<<
"Write output to file "
<<
fileName
<<
std
::
endl
;
}
else
{
std
::
cout
<<
"Could not open file "
<<
fileName
<<
" for writing"
<<
std
::
endl
;
}
};
// map the data type used by the GPU kernels to the corresponding type used by the host codes
template
<
typename
InType
>
struct
type_mapping
{
using
OutType
=
InType
;
};
template
<
>
struct
type_mapping
<
ck
::
half_t
>
{
using
OutType
=
half_float
::
half
;
};
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
int
Rank
,
int
NumReduceDim
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
ReduceTensorIndices
IndicesOpt
>
void
profile_reduce_impl_impl
(
bool
do_verification
,
bool
Propagat
eNan
,
bool
UseIndex
>
bool
profile_reduce_impl_impl
(
bool
do_verification
,
int
init_method
,
bool
do_log
,
bool
do_dumpout
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
...
...
@@ -166,15 +139,13 @@ void profile_reduce_impl_impl(bool do_verification,
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
::
device_reduce_instance
;
using
namespace
ck
::
host_reduce
;
using
ck
::
host_common
::
dumpBufferToFile
;
constexpr
bool
op_support_indices
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
(
op_support_indices
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
));
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
PROPAGATE_NAN
);
constexpr
bool
OutputIndex
=
(
op_support_indices
&&
UseIndex
);
constexpr
bool
out_support_atomic_add
=
std
::
is_same
<
OutDataType
,
float
>::
value
;
constexpr
bool
op_support_atomic_add
=
...
...
@@ -195,8 +166,7 @@ void profile_reduce_impl_impl(bool do_verification,
(
op_support_indices
&&
!
std
::
is_same
<
AccDataType
,
float
>::
value
);
// 1) The indices can only be used when the reduction operation is indexable
constexpr
bool
invalid_reduce_3
=
(
!
op_support_indices
&&
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
invalid_reduce_3
=
(
!
op_support_indices
&&
UseIndex
);
// 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations
// 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction
...
...
@@ -219,6 +189,8 @@ void profile_reduce_impl_impl(bool do_verification,
constexpr
bool
invalid_reduce
=
(
invalid_reduce_1
||
invalid_reduce_2
||
invalid_reduce_3
||
invalid_reduce_4
||
invalid_reduce_5
||
invalid_reduce_6
);
bool
pass
=
true
;
if
constexpr
(
!
invalid_reduce
)
{
Tensor
<
InDataType
>
in
(
inLengths
);
...
...
@@ -282,7 +254,7 @@ void profile_reduce_impl_impl(bool do_verification,
if
(
beta
!=
0.0
f
)
out_dev
.
ToDevice
(
out
.
mData
.
data
());
size_t
indicesSizeInBytes
=
NeedIndices
?
out
.
mDesc
.
GetElementSize
()
*
sizeof
(
int
)
:
0
;
size_t
indicesSizeInBytes
=
OutputIndex
?
out
.
mDesc
.
GetElementSize
()
*
sizeof
(
int
)
:
0
;
DeviceMem
out_indices_dev
(
indicesSizeInBytes
);
...
...
@@ -295,29 +267,11 @@ void profile_reduce_impl_impl(bool do_verification,
using
AccElementwiseOperation_0
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
using
InElementwiseOperation_1
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
InElementwiseOperation
;
using
AccElementwiseOperation_1
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
AccElementwiseOperation
;
using
InElementwiseOperation_2
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation_2
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
AccElementwiseOperation
;
using
DeviceReduceInstPtr0
=
DeviceReducePtr
<
InElementwiseOperation_0
,
AccElementwiseOperation_0
>
;
using
DeviceReduceInstPtr1
=
DeviceReducePtr
<
InElementwiseOperation_1
,
AccElementwiseOperation_1
>
;
using
DeviceReduceInstPtr2
=
DeviceReducePtr
<
InElementwiseOperation_2
,
AccElementwiseOperation_2
>
;
std
::
vector
<
DeviceReduceInstPtr0
>
reduce0_ptrs
;
std
::
vector
<
DeviceReduceInstPtr1
>
reduce1_ptrs
;
std
::
vector
<
DeviceReduceInstPtr2
>
reduce2_ptrs
;
add_device_reduce_instance_threadwise
<
InDataType
,
AccDataType
,
...
...
@@ -325,8 +279,8 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
ReduceOpId
,
Nan
Opt
,
IndicesOpt
>
(
reduce0_ptrs
);
Propagate
Nan
,
UseIndex
>
(
reduce0_ptrs
);
add_device_reduce_instance_blockwise
<
InDataType
,
AccDataType
,
...
...
@@ -334,8 +288,8 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
ReduceOpId
,
Nan
Opt
,
IndicesOpt
>
(
reduce0_ptrs
);
Propagate
Nan
,
UseIndex
>
(
reduce0_ptrs
);
if
constexpr
(
use_atomic_add
)
{
...
...
@@ -345,35 +299,11 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
ReduceOpId
,
Nan
Opt
,
IndicesOpt
>
(
reduce0_ptrs
);
Propagate
Nan
,
UseIndex
>
(
reduce0_ptrs
);
}
else
{
add_device_reduce_instance_multiblock_partial_reduce
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce1_ptrs
);
};
// used for secondary reduction
if
constexpr
(
!
use_atomic_add
)
{
add_device_reduce_instance_blockwise_second_call
<
AccDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce2_ptrs
);
};
if
(
reduce0_ptrs
.
empty
()
&&
reduce1_ptrs
.
empty
())
if
(
reduce0_ptrs
.
empty
())
{
throw
std
::
runtime_error
(
"Wrong! No device REDUCE instance found"
);
};
...
...
@@ -387,23 +317,25 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
PropagateNan
,
NeedIndices
>
OutputIndex
>
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
in
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
out_indices_ref
.
mData
.
data
());
};
const
auto
i_inLengths
=
to_int_vector
(
inLengths
);
const
auto
i_inStrides
=
to_int_vector
(
inStrides
);
const
auto
i_outLengths
=
to_int_vector
(
outLengths
);
const
auto
i_outStrides
=
to_int_vector
(
outStrides
);
std
::
vector
<
ck
::
index_t
>
i_inLengths
;
std
::
vector
<
ck
::
index_t
>
i_inStrides
;
std
::
vector
<
ck
::
index_t
>
i_outLengths
;
std
::
vector
<
ck
::
index_t
>
i_outStrides
;
i_inLengths
.
assign
(
inLengths
.
begin
(),
inLengths
.
end
());
i_inStrides
.
assign
(
inStrides
.
begin
(),
inStrides
.
end
());
i_outLengths
.
assign
(
outLengths
.
begin
(),
outLengths
.
end
());
i_outStrides
.
assign
(
outStrides
.
begin
(),
outStrides
.
end
());
for
(
auto
&
reduce_ptr
:
reduce0_ptrs
)
{
auto
wsSizeInBytes
=
reduce_ptr
->
GetWorkspaceSizeInBytes
(
i_inLengths
,
reduceDims
);
DeviceMem
ws_dev
(
wsSizeInBytes
);
InElementwiseOperation_0
in_elementwise_op_0
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation_0
acc_elementwise_op_0
(
...
...
@@ -417,9 +349,9 @@ void profile_reduce_impl_impl(bool do_verification,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
nullptr
,
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
in_elementwise_op_0
,
acc_elementwise_op_0
);
...
...
@@ -439,8 +371,9 @@ void profile_reduce_impl_impl(bool do_verification,
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
std
::
endl
;
if
(
time_kernel
)
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
std
::
endl
;
if
(
gb_per_sec
>
best_gb_per_sec
)
{
...
...
@@ -450,22 +383,24 @@ void profile_reduce_impl_impl(bool do_verification,
if
(
do_verification
)
{
bool
single_pass
;
out_dev
.
FromDevice
(
out
.
mData
.
data
());
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
single_pass
=
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
if
(
NeedIndices
)
if
(
OutputIndex
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
;
single_pass
=
single_pass
&&
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
)
;
};
if
(
do_log
)
if
(
!
single_pass
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_ref
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out
.
mData
,
","
)
<<
std
::
endl
;
}
;
std
::
cout
<<
"Fail Info: "
<<
reduce_ptr
->
GetTypeString
()
<<
std
::
endl
;
}
pass
=
pass
&&
single_pass
;
};
if
(
do_dumpout
)
...
...
@@ -474,7 +409,7 @@ void profile_reduce_impl_impl(bool do_verification,
dumpBufferToFile
(
"dump_out.bin"
,
out
.
mData
.
data
(),
out
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out_host.bin"
,
out_ref
.
mData
.
data
(),
out_ref
.
mDesc
.
GetElementSize
());
if
(
NeedIndices
)
if
(
OutputIndex
)
{
dumpBufferToFile
(
"dump_indices.bin"
,
out_indices
.
mData
.
data
(),
...
...
@@ -486,158 +421,34 @@ void profile_reduce_impl_impl(bool do_verification,
};
};
for
(
auto
&
reduce_ptr
:
reduce1_ptrs
)
{
auto
wsSizeInBytes
=
reduce_ptr
->
GetWorkspaceSizeInBytes
(
i_inLengths
,
reduceDims
);
DeviceMem
ws_dev
(
wsSizeInBytes
);
InElementwiseOperation_1
in_elementwise_op_1
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation_1
acc_elementwise_op_1
(
static_cast
<
int32_t
>
(
reduce_total_length
));
auto
argument_ptr
=
reduce_ptr
->
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
i_outLengths
,
i_outStrides
,
reduceDims
,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
in_elementwise_op_1
,
acc_elementwise_op_1
);
if
(
!
reduce_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
continue
;
std
::
string
reduce_name
=
reduce_ptr
->
GetTypeString
();
auto
invoker_ptr
=
reduce_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
num_bytes
=
invariant_total_length
*
reduce_total_length
*
sizeof
(
InDataType
)
+
invariant_total_length
*
sizeof
(
OutDataType
);
std
::
vector
<
int
>
inLengths2
=
reduce_ptr
->
GetWorkspace2dLengths
(
argument_ptr
.
get
());
std
::
vector
<
int
>
inStrides2
{
inLengths2
[
1
],
1
};
for
(
auto
&
reduce2_ptr
:
reduce2_ptrs
)
{
InElementwiseOperation_2
in_elementwise_op_2
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation_2
acc_elementwise_op_2
(
static_cast
<
int32_t
>
(
reduce_total_length
));
auto
argument2_ptr
=
reduce2_ptr
->
MakeArgumentPointer
(
inLengths2
,
inStrides2
,
i_outLengths
,
i_outStrides
,
reduceDims
,
alpha
,
beta
,
ws_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
in_elementwise_op_2
,
acc_elementwise_op_2
);
if
(
!
reduce2_ptr
->
IsSupportedArgument
(
argument2_ptr
.
get
()))
continue
;
std
::
string
reduce2_name
=
reduce2_ptr
->
GetTypeString
();
auto
invoker2_ptr
=
reduce2_ptr
->
MakeInvokerPointer
();
float
avg_time_2
=
invoker2_ptr
->
Run
(
argument2_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
num_bytes_2
=
static_cast
<
size_t
>
(
inLengths2
[
0
])
*
inLengths2
[
1
]
*
sizeof
(
AccDataType
);
float
gb_per_sec
=
(
num_bytes
+
num_bytes_2
)
/
1.E6
/
(
avg_time
+
avg_time_2
);
std
::
cout
<<
"Perf: "
<<
(
avg_time
+
avg_time_2
)
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
" => "
<<
reduce2_name
<<
std
::
endl
;
if
(
gb_per_sec
>
best_gb_per_sec
)
{
best_avg_time
=
avg_time
+
avg_time_2
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
if
(
NeedIndices
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
;
};
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_ref
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out
.
mData
,
","
)
<<
std
::
endl
;
}
}
if
(
do_dumpout
)
{
dumpBufferToFile
(
"dump_in.bin"
,
in
.
mData
.
data
(),
in
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out.bin"
,
out
.
mData
.
data
(),
out
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out_host.bin"
,
out_ref
.
mData
.
data
(),
out_ref
.
mDesc
.
GetElementSize
());
if
(
NeedIndices
)
{
dumpBufferToFile
(
"dump_indices.bin"
,
out_indices
.
mData
.
data
(),
out_indices
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_indices_host.bin"
,
out_indices_ref
.
mData
.
data
(),
out_indices_ref
.
mDesc
.
GetElementSize
());
};
};
};
};
std
::
cout
<<
"Best Perf: "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
if
(
time_kernel
)
std
::
cout
<<
"Best Perf: "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
}
else
{
std
::
cout
<<
"The requested reduction operation is not supported, please check !!!"
<<
std
::
endl
;
};
return
pass
;
};
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
>
void
profile_reduce_impl
(
bool
do_verification
,
bool
profile_reduce_impl
(
bool
do_verification
,
int
init_method
,
bool
do_log
,
bool
do_dumpout
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
ReduceTensorIndices
IndicesOpt
,
bool
Propagat
eNan
,
bool
UseIndex
,
float
alpha
,
float
beta
)
{
bool
matched
=
false
;
bool
pass
=
true
;
using
tuple_of_description_instances
=
tensor_operation
::
device
::
device_reduce_instance
::
reduce_description_instances
;
...
...
@@ -651,29 +462,30 @@ void profile_reduce_impl(bool do_verification,
using
descType
=
remove_cvref_t
<
decltype
(
std
::
get
<
i
>
(
tuple_object
))
>
;
if
(
!
description_match
(
descType
{},
inLengths
.
size
(),
reduceDims
,
ReduceOpId
,
NanOpt
,
IndicesOpt
))
descType
{},
inLengths
.
size
(),
reduceDims
,
ReduceOpId
,
PropagateNan
,
UseIndex
))
return
;
profile_reduce_impl_impl
<
InDataType
,
AccDataType
,
OutDataType
,
descType
::
Rank_
,
descType
::
NumReduceDim_
,
static_cast
<
ReduceTensorOp
>
(
descType
::
ReduceOpId_
),
static_cast
<
NanPropagation
>
(
descType
::
NanOpt_
),
static_cast
<
ReduceTensorIndices
>
(
descType
::
IndicesOpt_
)
>
(
do_verification
,
init_method
,
do_log
,
do_dumpout
,
time_kernel
,
inLengths
,
reduceDims
,
alpha
,
beta
);
pass
=
pass
&&
profile_reduce_impl_impl
<
InDataType
,
AccDataType
,
OutDataType
,
descType
::
Rank_
,
descType
::
NumReduceDim_
,
static_cast
<
ReduceTensorOp
>
(
descType
::
ReduceOpId_
),
static_cast
<
bool
>
(
descType
::
PropagateNan_
),
static_cast
<
bool
>
(
descType
::
UseIndex_
)
>
(
do_verification
,
init_method
,
do_dumpout
,
time_kernel
,
inLengths
,
reduceDims
,
alpha
,
beta
);
matched
=
true
;
});
return
pass
;
};
}
// namespace profiler
...
...
profiler/src/profile_batched_gemm.cpp
View file @
b2290854
...
...
@@ -396,5 +396,5 @@ int profile_batched_gemm(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_batched_gemm_reduce.cpp
View file @
b2290854
...
...
@@ -149,5 +149,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_bwd_weight.cpp
View file @
b2290854
...
...
@@ -142,5 +142,5 @@ int profile_conv_bwd_weight(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this Conv data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_fwd_bias_relu.cpp
View file @
b2290854
...
...
@@ -110,5 +110,5 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! data_type & layout for this operator is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_fwd_bias_relu_add.cpp
View file @
b2290854
...
...
@@ -111,5 +111,5 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! data_type & layout for this operator is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
View file @
b2290854
...
...
@@ -112,5 +112,5 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! data_type & layout for this operator is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_convnd_fwd.cpp
View file @
b2290854
...
...
@@ -347,5 +347,5 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
std
::
to_string
(
num_dim_spatial
));
}
return
1
;
return
0
;
}
profiler/src/profile_gemm.cpp
View file @
b2290854
...
...
@@ -68,6 +68,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -88,6 +89,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -108,6 +110,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -128,6 +131,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -146,6 +150,7 @@ int profile_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
...
...
@@ -166,6 +171,7 @@ int profile_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
...
...
@@ -186,6 +192,7 @@ int profile_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
...
...
@@ -206,6 +213,7 @@ int profile_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
...
...
@@ -228,6 +236,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -248,6 +257,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -268,6 +278,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -288,6 +299,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -308,6 +320,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -328,6 +341,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -348,6 +362,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -368,6 +383,7 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
...
...
@@ -388,5 +404,5 @@ int profile_gemm(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_bias_2d.cpp
View file @
b2290854
...
...
@@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_bias_relu.cpp
View file @
b2290854
...
...
@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_bias_relu_add.cpp
View file @
b2290854
...
...
@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_reduce.cpp
View file @
b2290854
...
...
@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_grouped_gemm.cpp
View file @
b2290854
...
...
@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[])
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
ck
::
profiler
::
profile_grouped_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
...
...
@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
ck
::
profiler
::
profile_grouped_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
...
...
@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
ck
::
profiler
::
profile_grouped_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
...
...
@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
ck
::
profiler
::
profile_grouped_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
...
...
@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
Prev
1
…
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment