gaoqiong / composable_kernel

Commit 95a83c6e, authored Nov 18, 2022 by Adam Osewski

    Merge remote-tracking branch 'origin/develop' into wavelet_model

Parents: 5b7c2432, 892a8d76

Changes: 618. This page shows 20 changed files, with 414 additions and 221 deletions (+414 -221).
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp (+27 -0)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp (+27 -0)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp (+27 -0)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp (+27 -0)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp (+27 -0)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp (+27 -0)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp (+27 -0)
library/src/utility/convolution_parameter.cpp (+4 -8)
profiler/CMakeLists.txt (+6 -4)
profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp (+6 -6)
profiler/include/profile_batched_gemm_gemm_impl.hpp (+6 -6)
profiler/include/profile_batched_gemm_impl.hpp (+6 -6)
profiler/include/profile_batched_gemm_reduce_impl.hpp (+12 -18)
profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp (+23 -13)
profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp (+145 -140)
profiler/include/profile_conv_bwd_data_impl.hpp (+1 -2)
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp (+7 -8)
profiler/include/profile_conv_fwd_bias_relu_impl.hpp (+7 -8)
profiler/include/profile_conv_fwd_impl.hpp (+1 -1)
profiler/include/profile_convnd_bwd_data_impl.hpp (+1 -1)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <vector>

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

static constexpr index_t RANK = 3;

void add_device_softmax_i8_i8_rank3_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <vector>

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

static constexpr index_t RANK = 3;

void add_device_softmax_i8_i8_rank3_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <vector>

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

static constexpr index_t RANK = 3;

void add_device_softmax_i8_i8_rank3_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <vector>

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

static constexpr index_t RANK = 4;

void add_device_softmax_i8_i8_rank4_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <vector>

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

static constexpr index_t RANK = 4;

void add_device_softmax_i8_i8_rank4_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <vector>

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

static constexpr index_t RANK = 4;

void add_device_softmax_i8_i8_rank4_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <vector>

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

static constexpr index_t RANK = 4;

void add_device_softmax_i8_i8_rank4_reduce4_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 4>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
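Each of the seven new translation units registers the int8-in/int8-out softmax instances for one (rank, reduce-dimension) combination; splitting them one combination per file keeps the heavily templated objects small and the build parallel. As a hedged usage sketch (illustrative only, not part of the commit; it assumes the I8, F32, and PassThrough aliases and DeviceSoftmaxPtr are visible as in the files above), a caller could collect the rank-3 instances and list them via the GetTypeString() accessor the profiler uses elsewhere in this diff:

    // Illustrative fragment: enumerate the freshly registered instances.
    using namespace ck::tensor_operation::device;

    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>> instances;
    instance::add_device_softmax_i8_i8_rank3_reduce1_instances(instances);

    for(const auto& p : instances)
        std::cout << p->GetTypeString() << std::endl; // one line per kernel variant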
library/src/utility/convolution_parameter.cpp

@@ -72,14 +72,10 @@ std::size_t ConvParam::GetFlops() const
 {
     // 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
     return static_cast<std::size_t>(2) * G_ * N_ * K_ * C_ *
-           std::accumulate(std::begin(output_spatial_lengths_),
-                           std::begin(output_spatial_lengths_) + num_dim_spatial_,
-                           static_cast<std::size_t>(1),
-                           std::multiplies<std::size_t>()) *
-           std::accumulate(std::begin(filter_spatial_lengths_),
-                           std::begin(filter_spatial_lengths_) + num_dim_spatial_,
-                           static_cast<std::size_t>(1),
-                           std::multiplies<std::size_t>());
+           ck::accumulate_n<std::size_t>(
+               std::begin(output_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>()) *
+           ck::accumulate_n<std::size_t>(
+               std::begin(filter_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>());
 }

 std::string get_conv_param_parser_helper_msg()
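The refactor folds each std::accumulate call into ck::accumulate_n, which takes a start iterator plus an element count instead of an iterator pair and pins the accumulation type through its template argument. A minimal stand-in with the shape the call sites imply (an assumption; the real CK utility may differ):

    #include <iterator>
    #include <numeric>

    // Hypothetical equivalent of ck::accumulate_n as used above: reduce n
    // elements from first with op, seeded by init, accumulating in T
    // (std::size_t in GetFlops, so the product is computed in 64-bit).
    template <typename T, typename ForwardIterator, typename Size, typename BinaryOp>
    T accumulate_n(ForwardIterator first, Size n, T init, BinaryOp op)
    {
        return std::accumulate(first, std::next(first, n), init, op);
    }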
profiler/CMakeLists.txt

@@ -20,8 +20,8 @@ set(PROFILER_SOURCE
     src/profile_conv_fwd_bias_relu.cpp
     src/profile_conv_fwd_bias_relu_add.cpp
     src/profile_conv_bwd_data.cpp
-    src/profile_conv_bwd_weight.cpp
     src/profile_grouped_conv_fwd.cpp
+    src/profile_grouped_conv_bwd_weight.cpp
     src/profile_reduce.cpp
     src/profile_groupnorm.cpp
     src/profile_layernorm.cpp

@@ -49,11 +49,13 @@ target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_data_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_data_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_weight_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
-target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_weight_instance)
+target_link_libraries(ckProfiler PRIVATE device_grouped_conv1d_bwd_weight_instance)
+target_link_libraries(ckProfiler PRIVATE device_grouped_conv2d_bwd_weight_instance)
+target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_bwd_weight_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
 target_link_libraries(ckProfiler PRIVATE device_softmax_instance)
 target_link_libraries(ckProfiler PRIVATE device_reduce_instance)

 rocm_install(TARGETS ckProfiler COMPONENT profiler)
profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp

@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"

 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

 namespace ck {

@@ -111,15 +112,15 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
                                        std::size_t stride,
                                        std::size_t batch_stride,
                                        auto layout) {
+        using namespace ck::literals;
+
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
         }
         else
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
         }
     };

@@ -330,8 +331,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
     {
         e1_g_m_o_device_buf.FromDevice(e1_g_m_o_device_result.mData.data());

-        pass = pass & ck::utils::check_err(e1_g_m_o_device_result.mData, e1_g_m_o_host_result.mData);
+        pass = pass & ck::utils::check_err(e1_g_m_o_device_result, e1_g_m_o_host_result);

         if(do_log)
         {
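The new literals.hpp include and the 1_uz suffix keep the braced stride lists homogeneous: {batch_stride, stride, 1} mixes std::size_t and int, which breaks std::initializer_list<std::size_t> deduction for the brace-init HostTensorDescriptor call, while 1_uz makes every element a std::size_t. A minimal sketch of the user-defined literal that ck/library/utility/literals.hpp presumably provides (an assumption; the real header may differ):

    #include <cstddef>

    namespace ck {
    namespace literals {

    // Assumed definition: turn an integer literal into std::size_t, so that
    // {batch_stride, stride, 1_uz} deduces as std::initializer_list<std::size_t>.
    constexpr std::size_t operator""_uz(unsigned long long x)
    {
        return static_cast<std::size_t>(x);
    }

    } // namespace literals
    } // namespace ck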
profiler/include/profile_batched_gemm_gemm_impl.hpp

@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"

 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

 namespace ck {

@@ -105,15 +106,15 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
                                        std::size_t stride,
                                        std::size_t batch_stride,
                                        auto layout) {
+        using namespace ck::literals;
+
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
         }
         else
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
         }
     };

@@ -283,8 +284,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
     {
         c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());

-        pass = pass & ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData);
+        pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result);

         if(do_log)
         {
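A recurring change across these profiler headers: verification calls now pass the Tensor objects themselves instead of their raw .mData vectors, which implies a Tensor-aware check_err overload. A hedged sketch of its likely shape — only the call shape, the message string, and the "default absolute error and relative error is 0.001" defaults are confirmed by this diff; the body is an assumption:

    // Assumed Tensor-aware overload, forwarding to the pre-existing
    // vector overload these files used before this commit.
    namespace ck {
    namespace utils {

    template <typename T>
    bool check_err(const Tensor<T>& result,
                   const Tensor<T>& reference,
                   const std::string& msg = "Error: Incorrect results!",
                   double rtol            = 1e-3,
                   double atol            = 1e-3)
    {
        return check_err(result.mData, reference.mData, msg, rtol, atol);
    }

    } // namespace utils
    } // namespace ck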
profiler/include/profile_batched_gemm_impl.hpp

@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"

 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

 namespace ck {

@@ -50,15 +51,15 @@ bool profile_batched_gemm_impl(int do_verification,
                                        std::size_t stride,
                                        std::size_t batch_stride,
                                        auto layout) {
+        using namespace ck::literals;
+
         if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
         }
         else
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
         }
     };

@@ -202,8 +203,7 @@ bool profile_batched_gemm_impl(int do_verification,
     {
         c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());

-        pass = pass & ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData);
+        pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);

         if(do_log)
         {
profiler/include/profile_batched_gemm_reduce_impl.hpp

@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"

 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"

 namespace ck {

@@ -78,15 +79,15 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
                                        std::size_t col,
                                        std::size_t stride,
                                        auto layout) {
+        using namespace ck::literals;
+
         if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({row * stride, stride, 1}));
+            return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
        }
        else
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({col * stride, 1, stride}));
+            return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
        }
    };

@@ -95,17 +96,13 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
     Tensor<CDataType> c_g_m_n_host_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
-        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
-        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> d0_g_m_host_result({BatchCount, M});
+    Tensor<ReduceDataType> d1_g_m_host_result({BatchCount, M});

     Tensor<CDataType> c_g_m_n_device_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
-        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
-        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> d0_g_m_device_result({BatchCount, M});
+    Tensor<ReduceDataType> d1_g_m_device_result({BatchCount, M});

     std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
     std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;

@@ -319,12 +316,9 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
     reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
     reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data());

-    bool c_error  = ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData);
-    bool d0_error = ck::utils::check_err(d0_g_m_device_result.mData, d0_g_m_host_result.mData);
-    bool d1_error = ck::utils::check_err(d1_g_m_device_result.mData, d1_g_m_host_result.mData);
+    bool c_error  = ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
+    bool d0_error = ck::utils::check_err(d0_g_m_device_result, d0_g_m_host_result);
+    bool d1_error = ck::utils::check_err(d1_g_m_device_result, d1_g_m_host_result);

     pass = pass && (c_error == true);
     pass = pass && (d0_error == true);
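Declarations like d0_g_m_host_result({BatchCount, M}) above, and bias_k({K}) in the conv files below, build a Tensor from lengths alone, so packed row-major strides must be derived internally. A runnable sketch of that derivation, under the assumption that the lengths-only constructor works this way:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Packed (row-major) strides from lengths -- what a lengths-only
    // Tensor<T>({BatchCount, M}) constructor is assumed to compute.
    std::vector<std::size_t> packed_strides(const std::vector<std::size_t>& lengths)
    {
        if(lengths.empty())
            return {};
        std::vector<std::size_t> strides(lengths.size(), 1);
        for(std::size_t d = lengths.size() - 1; d > 0; --d)
            strides[d - 1] = strides[d] * lengths[d];
        return strides;
    }

    int main()
    {
        for(std::size_t s : packed_strides({4, 16, 8})) // e.g. {G, M, N}
            std::cout << s << ' ';                      // prints: 128 8 1
    }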
profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp

@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"

 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"

@@ -29,7 +30,8 @@ template <typename ADataType,
           typename ALayout,
           typename B0Layout,
           typename B1Layout,
-          typename CLayout>
+          typename CLayout,
+          bool MaskOutUpperTriangle>
 bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
                                             int init_method,
                                             bool do_log,

@@ -46,16 +48,18 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
                                             int BatchStrideA  = -1,
                                             int BatchStrideB0 = -1,
                                             int BatchStrideB1 = -1,
-                                            int BatchStrideC  = -1)
+                                            int BatchStrideC  = -1,
+                                            float alpha       = 1.f)
 {
     using Row = tensor_layout::gemm::RowMajor;
     using Col = tensor_layout::gemm::ColumnMajor;

     using PassThrough = tensor_operation::element_wise::PassThrough;
+    using Scale       = tensor_operation::element_wise::Scale;

     using AElementOp    = PassThrough;
     using B0ElementOp   = PassThrough;
-    using Acc0ElementOp = PassThrough;
+    using Acc0ElementOp = Scale;
     using B1ElementOp   = PassThrough;
     using CElementOp    = PassThrough;

     using AccDataType = float;

@@ -67,7 +71,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
                                             AccDataType,
                                             AElementOp,
                                             B0ElementOp,
-                                            CElementOp>;
+                                            Acc0ElementOp>;

     // Ref Softmax: fp32 in, various type out
     using ReferenceSoftmaxInstance =

@@ -110,15 +114,15 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
                                        std::size_t stride,
                                        std::size_t batch_stride,
                                        auto layout) {
+        using namespace ck::literals;
+
         if(std::is_same<decltype(layout), Row>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
         }
         else
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+            return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
         }
     };

@@ -185,7 +189,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
     auto a_element_op    = AElementOp{};
     auto b0_element_op   = B0ElementOp{};
-    auto acc0_element_op = Acc0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{alpha};
     auto b1_element_op   = B1ElementOp{};
     auto c_element_op    = CElementOp{};

@@ -201,7 +205,8 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
                                             B0ElementOp,
                                             Acc0ElementOp,
                                             B1ElementOp,
-                                            CElementOp>;
+                                            CElementOp,
+                                            MaskOutUpperTriangle>;

     // get device op instances
     const auto op_ptrs = tensor_operation::device::instance::DeviceOperationInstanceFactory<

@@ -214,10 +219,16 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
         auto ref_gemm0          = ReferenceGemm0Instance{};
         auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
         auto ref_gemm0_argument = ref_gemm0.MakeArgument(
-            a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, PassThrough{});
+            a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, Scale{alpha});

         ref_gemm0_invoker.Run(ref_gemm0_argument);

+        // mask out upper triangle
+        acc0_g_m_n.ForEach([&](auto& self, auto idx) {
+            if(MaskOutUpperTriangle && idx[1] < idx[2])
+                self(idx) = -ck::NumericLimits<float>::Infinity();
+        });
+
         auto ref_softmax          = ReferenceSoftmaxInstance{};
         auto ref_softmax_invoker  = ref_softmax.MakeInvoker();
         auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2});

@@ -297,8 +308,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
     {
         c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());

-        pass = pass & ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData);
+        pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result);

         if(do_log)
         {
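Two behavioral changes ride along here: Acc0ElementOp becomes Scale, so the first GEMM's fp32 accumulator is multiplied by alpha (the softmax scaling factor, e.g. 1/sqrt(K) in attention), and when MaskOutUpperTriangle is set the reference writes -infinity above the diagonal before the softmax, which makes those probabilities exactly zero since exp(-inf) = 0 — the usual causal mask. A hedged sketch of the Scale functor as constructed and applied above (the real ck::tensor_operation::element_wise::Scale is templated more broadly; this captures only the float path the profiler relies on):

    // Assumed float-path behavior of element_wise::Scale: y = scale_ * x.
    // Constructed as Acc0ElementOp{alpha} in the profiler code above.
    struct Scale
    {
        float scale_;
        void operator()(float& y, const float& x) const { y = scale_ * x; }
    };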
profiler/include/profile_batched_gemm_masking_scale_softmax_gemm_permute_impl.hpp
  → profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp (renamed)
@@ -7,51 +7,48 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-#include "ck/library/tensor_operation_instance/gpu/batched_gemm_masking_scale_softmax_gemm_permute.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"

 #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"

 namespace ck {
 namespace profiler {

-template <typename ADataType,
+template <index_t NumDimG,
+          index_t NumDimM,
+          index_t NumDimN,
+          index_t NumDimK,
+          index_t NumDimO,
+          typename ADataType,
           typename B0DataType,
           typename B1DataType,
           typename CDataType,
-          typename ALayout,
-          typename B0Layout,
-          typename B1Layout,
-          typename CPermuteNumDims_G_M_O>
-bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verification,
-                                                                  int init_method,
-                                                                  bool do_log,
-                                                                  bool time_kernel,
-                                                                  int M,
-                                                                  int N,
-                                                                  int K,
-                                                                  int O,
-                                                                  int G0,
-                                                                  int G1,
-                                                                  int StrideA       = -1,
-                                                                  int StrideB0      = -1,
-                                                                  int StrideB1      = -1,
-                                                                  int BatchStrideA  = -1,
-                                                                  int BatchStrideB0 = -1,
-                                                                  int BatchStrideB1 = -1,
-                                                                  float alpha       = 1.f)
+          typename Acc0BiasesDataType,
+          typename Acc1BiasesDataType,
+          tensor_operation::device::MaskingSpecialization MaskingSpec>
+bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
+                                                    int init_method,
+                                                    bool do_log,
+                                                    bool time_kernel,
+                                                    int M,
+                                                    int N,
+                                                    int K,
+                                                    int O,
+                                                    int G0,
+                                                    int G1,
+                                                    float alpha = 1.f)
 {
     using Row = tensor_layout::gemm::RowMajor;
     using Col = tensor_layout::gemm::ColumnMajor;

     using PassThrough = tensor_operation::element_wise::PassThrough;
     using Scale       = tensor_operation::element_wise::Scale;

     using AElementOp = PassThrough;
@@ -60,6 +57,7 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
     using B1ElementOp = PassThrough;
     using CElementOp  = PassThrough;

     using AccDataType = float;
+    using tensor_operation::device::MaskingSpecialization;

     // Ref Gemm0: various type in, fp32 out
     using ReferenceGemm0Instance = tensor_operation::host::ReferenceBatchedGemm<ADataType,
@@ -85,67 +83,33 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
     bool pass = true;

-    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
-    std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
-
-    const int DefaultStrideA  = ck::is_same_v<ALayout, Row> ? K : M;
-    const int DefaultStrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
-    const int DefaultStrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
-
-    StrideA  = (StrideA < 0) ? DefaultStrideA : StrideA;
-    StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0;
-    StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1;
-
-    const int DefaultBatchStrideA  = (ck::is_same_v<ALayout, Col> ? K : M) * StrideA;
-    const int DefaultBatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
-    const int DefaultBatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
-
-    BatchStrideA  = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA;
-    BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0;
-    BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1;
+    // A layout [G0, M, G1, K]
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
+
+    // B0 layout [G0, N, G1, K]
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1};
+
+    // B1 layout [G0, N, G1, O]
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O};
+
+    // C layout [G0, M, G1, O]
+    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
+    std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};

     const int BatchCount = G0 * G1;

-    auto f_host_tensor_descriptor = [](std::size_t batch_count,
-                                       std::size_t row,
-                                       std::size_t col,
-                                       std::size_t stride,
-                                       std::size_t batch_stride,
-                                       auto layout) {
-        if(std::is_same<decltype(layout), Row>::value)
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, stride, 1}));
-        }
-        else
-        {
-            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
-                                        std::vector<std::size_t>({batch_stride, 1, stride}));
-        }
-    };
-
-    // C_m_o = A_m_k * B0_k_n * B1_n_o
-    Tensor<ADataType> a_g_m_k(
-        f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{}));
-    Tensor<B0DataType> b0_g_k_n(
-        f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{}));
-    Tensor<B1DataType> b1_g_n_o(
-        f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{}));
-    Tensor<CDataType> c_gs_ms_os_host_result(
-        std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
-        std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
-    Tensor<CDataType> c_gs_ms_os_device_result(
-        std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
-        std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
-
-    // Host verification: Output of Gemm0 is input A of Gemm1
-    Tensor<AccDataType> acc0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
-    Tensor<ADataType> a1_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
-    Tensor<CDataType> c_g_m_o_host_result(std::vector<int>{BatchCount, M, O},
-                                          std::vector<int>{M * O, O, 1});
-
-    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
-    std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl;
-    std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl;
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
+    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
+    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
+    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+
+    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
+    std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
+    std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl;
+    std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl;

     std::srand(1); // work around test flakiness
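The stride vectors above encode the permuted attention layouts named in the comments: lengths are ordered (G0, G1, M, K) while memory order is [G0, M, G1, K], because the G1 stride (K) is smaller than the M stride (G1 * K). A small runnable check of that reading, assuming the usual offset = sum(idx[d] * stride[d]) convention for HostTensorDescriptor:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Linear offset of a multi-index under explicit strides.
    std::size_t offset(const std::vector<std::size_t>& idx,
                       const std::vector<std::size_t>& strides)
    {
        std::size_t off = 0;
        for(std::size_t d = 0; d < idx.size(); ++d)
            off += idx[d] * strides[d];
        return off;
    }

    int main()
    {
        // A tensor: lengths {G0, G1, M, K} = {2, 3, 4, 5},
        // strides {M * G1 * K, K, G1 * K, 1} = {60, 5, 15, 1}.
        std::vector<std::size_t> strides{60, 5, 15, 1};
        // Stepping k is contiguous, then g1, then m, then g0:
        std::cout << offset({0, 0, 0, 1}, strides) << ' '   // 1  (k)
                  << offset({0, 1, 0, 0}, strides) << ' '   // 5  (g1)
                  << offset({0, 0, 1, 0}, strides) << ' '   // 15 (m)
                  << offset({1, 0, 0, 0}, strides) << '\n'; // 60 (g0)
    }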
@@ -157,38 +121,38 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
         // or not. May want to try exact same approach as the GPU kernel in the host reference
         // GEMM+Softmax+GEMM function to see if the accuracy discrepancy goes away. Until then,
         // shrink the input value range as it is less likely to produce errors of around ~1e-3.
-        // a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        // b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
-        // b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-5, 5});
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
+        // a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        // b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
+        // b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-5, 5});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
         break;
     case 2:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
         break;
     case 3:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
         break;
     default:
-        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
-        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
     }

-    DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSize());
-    DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSize());
-    DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSize());
-    DeviceMem c_gs_ms_os_device_buf(sizeof(CDataType) *
-                                    c_gs_ms_os_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b1_device_buf(sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize());
+    DeviceMem c_device_buf(sizeof(CDataType) * c_gs_ms_os_device_result.mDesc.GetElementSpaceSize());

-    a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data());
-    b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data());
-    b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data());
+    a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
+    b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data());
+    b1_device_buf.ToDevice(b1_gs_os_ns.mData.data());

     auto a_element_op  = AElementOp{};
     auto b0_element_op = B0ElementOp{};
@@ -196,20 +160,23 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
     auto b1_element_op = B1ElementOp{};
     auto c_element_op  = CElementOp{};

-    using DeviceOp = tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<ALayout,
-                                                                                   B0Layout,
-                                                                                   B1Layout,
-                                                                                   CPermuteNumDims_G_M_O,
-                                                                                   ADataType,
-                                                                                   B0DataType,
-                                                                                   B1DataType,
-                                                                                   CDataType,
-                                                                                   AElementOp,
-                                                                                   B0ElementOp,
-                                                                                   Acc0ElementOp,
-                                                                                   B1ElementOp,
-                                                                                   CElementOp>;
+    using DeviceOp = tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2,
+                                                                                   1,
+                                                                                   1,
+                                                                                   1,
+                                                                                   1,
+                                                                                   ADataType,
+                                                                                   B0DataType,
+                                                                                   B1DataType,
+                                                                                   CDataType,
+                                                                                   ck::Tuple<>,
+                                                                                   ck::Tuple<>,
+                                                                                   AElementOp,
+                                                                                   B0ElementOp,
+                                                                                   Acc0ElementOp,
+                                                                                   B1ElementOp,
+                                                                                   CElementOp,
+                                                                                   MaskingSpec>;

     // get device op instances
     const auto op_ptrs = tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -219,6 +186,26 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
     if(do_verification)
     {
         c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());

+        Tensor<ADataType> a_g_m_k({BatchCount, M, K});
+        Tensor<B0DataType> b0_g_k_n({BatchCount, K, N});
+        Tensor<B1DataType> b1_g_n_o({BatchCount, N, O});
+        Tensor<AccDataType> acc0_g_m_n({BatchCount, M, N});        // scratch object after gemm0
+        Tensor<ADataType> a1_g_m_n({BatchCount, M, N});            // scratch object after softmax
+        Tensor<CDataType> c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1
+
+        // permute
+        a_gs_ms_ks.ForEach(
+            [&](auto& self, auto idx) { a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); });
+        b0_gs_ns_ks.ForEach(
+            [&](auto& self, auto idx) { b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); });
+        b1_gs_os_ns.ForEach(
+            [&](auto& self, auto idx) { b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); });
+
         auto ref_gemm0          = ReferenceGemm0Instance{};
         auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
         auto ref_gemm0_argument = ref_gemm0.MakeArgument(
@@ -228,7 +215,7 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
         // mask out upper triangle
         acc0_g_m_n.ForEach([&](auto& self, auto idx) {
-            if(idx[1] < idx[2])
+            if(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle && idx[1] < idx[2])
                 self(idx) = -ck::NumericLimits<float>::Infinity();
         });
@@ -265,23 +252,24 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
     for(auto& op_ptr : op_ptrs)
     {
         auto argument_ptr = op_ptr->MakeArgumentPointer(
-            static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()),
-            static_cast<B0DataType*>(b0_g_k_n_device_buf.GetDeviceBuffer()),
-            static_cast<B1DataType*>(b1_g_n_o_device_buf.GetDeviceBuffer()),
-            static_cast<CDataType*>(c_gs_ms_os_device_buf.GetDeviceBuffer()),
-            M,
-            N,
-            K,
-            O,
-            BatchCount,
+            static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+            static_cast<B0DataType*>(b0_device_buf.GetDeviceBuffer()),
+            static_cast<B1DataType*>(b1_device_buf.GetDeviceBuffer()),
+            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+            {}, // std::array<void*, 1> p_acc0_biases;
+            {}, // std::array<void*, 1> p_acc1_biases;
+            a_gs_ms_ks_lengths,
+            a_gs_ms_ks_strides,
+            b0_gs_ns_ks_lengths,
+            b0_gs_ns_ks_strides,
+            b1_gs_os_ns_lengths,
+            b1_gs_os_ns_strides,
             c_gs_ms_os_lengths,
             c_gs_ms_os_strides,
-            StrideA,
-            StrideB0,
-            StrideB1,
-            BatchStrideA,
-            BatchStrideB0,
-            BatchStrideB1,
+            {}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_lengths},
+            {}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_strides},
+            {}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_lengths},
+            {}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_strides},
             a_element_op,
             b0_element_op,
             acc0_element_op,
@@ -319,18 +307,35 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
         if(do_verification)
         {
-            c_gs_ms_os_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
+            c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
+
+            // default absolute error and relative error is 0.001
+            double rtol = 1e-3;
+            double atol = 1e-3;
+
+            // when BF16 is taken, set absolute error and relative error to 0.01
+            if(std::is_same_v<ADataType, ck::bhalf_t> && std::is_same_v<B0DataType, ck::bhalf_t> &&
+               std::is_same_v<B1DataType, ck::bhalf_t> && std::is_same_v<CDataType, ck::bhalf_t>)
+            {
+                rtol = 1e-2;
+                atol = 1e-2;
+            }

-            pass = pass & ck::utils::check_err(c_gs_ms_os_device_result.mData,
-                                               c_gs_ms_os_host_result.mData);
+            pass = pass & ck::utils::check_err(c_gs_ms_os_device_result,
+                                               c_gs_ms_os_host_result,
+                                               "Error: Incorrect results!",
+                                               rtol,
+                                               atol);

             if(do_log)
             {
-                LogRangeAsType<float>(std::cout << "a_g_m_k: ", a_g_m_k.mData, ",") << std::endl;
-                LogRangeAsType<float>(std::cout << "b0_g_k_n: ", b0_g_k_n.mData, ",") << std::endl;
-                LogRangeAsType<float>(std::cout << "b1_g_n_o: ", b1_g_n_o.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "a_gs_ms_ks: ", a_gs_ms_ks.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "b0_gs_ns_ks: ", b0_gs_ns_ks.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "b1_gs_os_ns: ", b1_gs_os_ns.mData, ",") << std::endl;
                 LogRangeAsType<float>(
                     std::cout << "c_gs_ms_os_host_result : ", c_gs_ms_os_host_result.mData, ",")
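The looser bf16 tolerance follows from the format itself: ck::bhalf_t keeps an 8-bit significand (7 stored bits), so a single rounding step can already cost a relative error near 2^-8 ≈ 0.4%, and the default rtol = 1e-3 would reject correct kernels. A self-contained illustration, with bf16 modeled as float32 with the low 16 mantissa bits truncated (one common conversion):

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Truncate a float to bfloat16 precision by zeroing the low 16 bits.
    // Used only to show the representation error against rtol = 1e-3 vs 1e-2.
    float to_bf16_trunc(float x)
    {
        std::uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        bits &= 0xFFFF0000u;
        std::memcpy(&x, &bits, sizeof(bits));
        return x;
    }

    int main()
    {
        float x  = 0.3f;
        float bf = to_bf16_trunc(x);
        // ~3.9e-3: outside the default rtol = 1e-3, inside the bf16 rtol = 1e-2.
        std::cout << "relative error: " << (x - bf) / x << '\n';
    }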
profiler/include/profile_conv_bwd_data_impl.hpp

@@ -209,8 +209,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
     {
         in_device_buf.FromDevice(input_device_result.mData.data());

-        pass = pass & ck::utils::check_err(input_device_result.mData, input_host_result.mData);
+        pass = pass & ck::utils::check_err(input_device_result, input_host_result);

         if(do_log)
         {
profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp

@@ -12,6 +12,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"

 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"

 namespace ck {

@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
     auto f_host_tensor_descriptor = [](std::size_t N_,
                                        std::size_t C_,
                                        std::size_t H,
                                        std::size_t W,
                                        auto layout) {
+        using namespace ck::literals;
+
         if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
                      is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
                      is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
         }
         else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
                           is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
                           is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
         }
     };

@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
         f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));

     // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+    Tensor<OutDataType> bias_k({K});

     // residual: assume same layout as output tensor
     Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));

@@ -251,8 +251,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
     {
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

-        ck::utils::check_err(out_n_k_ho_wo_device_result.mData, out_n_k_ho_wo_host_result.mData);
+        ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);

         if(do_log)
         {
profiler/include/profile_conv_fwd_bias_relu_impl.hpp

@@ -12,6 +12,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"

 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"

 namespace ck {

@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
     auto f_host_tensor_descriptor = [](std::size_t N_,
                                        std::size_t C_,
                                        std::size_t H,
                                        std::size_t W,
                                        auto layout) {
+        using namespace ck::literals;
+
         if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
                      is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
                      is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
         }
         else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
                           is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
                           is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
         {
-            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                        std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
         }
     };

@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
         f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));

     // bias: assume contiguous 1d vector
-    Tensor<OutDataType> bias_k(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+    Tensor<OutDataType> bias_k({K});

     std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
     std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;

@@ -239,8 +239,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
     {
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

-        ck::utils::check_err(out_n_k_ho_wo_device_result.mData, out_n_k_ho_wo_host_result.mData);
+        ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);

         if(do_log)
         {
profiler/include/profile_conv_fwd_impl.hpp

@@ -191,7 +191,7 @@ bool profile_conv_fwd_impl(int do_verification,
     {
         out_device_buf.FromDevice(device_output.mData.data());

-        pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
+        pass = pass & ck::utils::check_err(device_output, host_output);

         if(do_log)
         {
profiler/include/profile_convnd_bwd_data_impl.hpp

@@ -453,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
         std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
     }

-    success = ck::utils::check_err(input_host_result.mData, input_device_result.mData);
+    success = ck::utils::check_err(input_host_result, input_device_result);

     if(do_log)
     {