Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
4100d1d8
Commit
4100d1d8
authored
Aug 23, 2023
by
Alan Turner
Browse files
Merge remote-tracking branch 'origin/develop' into migx-flash-attn
parents
48717006
c8a8385f
Changes
609
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
105 additions
and
226 deletions
+105
-226
library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
...u/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+3
-1
library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
...u/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
+3
-1
library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
...u/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
+6
-2
library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
...u/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
+6
-2
library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp
...tion_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp
+41
-0
library/src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt
...src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt
+0
-10
library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
.../gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+0
-23
library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
.../gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+0
-23
library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
.../gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
+0
-30
library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
.../gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
+0
-30
library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
...tensor_operation_instance/gpu/quantization/CMakeLists.txt
+17
-23
library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
...uantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
+1
-1
library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
.../src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
+10
-16
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
..._instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
+0
-40
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
...softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
+3
-4
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
...softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
+3
-4
No files found.
library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
→
library/src/tensor_operation_instance/gpu/pool
3d
_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
View file @
4100d1d8
...
@@ -11,7 +11,9 @@ namespace instance {
...
@@ -11,7 +11,9 @@ namespace instance {
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
AVG
;
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
AVG
;
void
add_device_pool3d_fwd_ndhwc_f16_instances
(
void
add_device_pool3d_fwd_ndhwc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F16
,
F16
,
I32
,
ReduceOpId
,
false
>>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F16
,
F16
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
false
>>>&
instances
)
{
{
add_device_operation_instances
(
add_device_operation_instances
(
instances
,
device_pool3d_fwd_ndhwc_instances
<
F16
,
F16
,
I32
,
F32
,
ReduceOpId
,
false
>
{});
instances
,
device_pool3d_fwd_ndhwc_instances
<
F16
,
F16
,
I32
,
F32
,
ReduceOpId
,
false
>
{});
...
...
library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
→
library/src/tensor_operation_instance/gpu/pool
3d
_fwd/device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
View file @
4100d1d8
...
@@ -11,7 +11,9 @@ namespace instance {
...
@@ -11,7 +11,9 @@ namespace instance {
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
AVG
;
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
AVG
;
void
add_device_pool3d_fwd_ndhwc_f32_instances
(
void
add_device_pool3d_fwd_ndhwc_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F32
,
F32
,
I32
,
ReduceOpId
,
false
>>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F32
,
F32
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
false
>>>&
instances
)
{
{
add_device_operation_instances
(
add_device_operation_instances
(
instances
,
device_pool3d_fwd_ndhwc_instances
<
F32
,
F32
,
I32
,
F32
,
ReduceOpId
,
false
>
{});
instances
,
device_pool3d_fwd_ndhwc_instances
<
F32
,
F32
,
I32
,
F32
,
ReduceOpId
,
false
>
{});
...
...
library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
→
library/src/tensor_operation_instance/gpu/pool
3d
_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp
View file @
4100d1d8
...
@@ -11,14 +11,18 @@ namespace instance {
...
@@ -11,14 +11,18 @@ namespace instance {
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
MAX
;
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
MAX
;
void
add_device_pool3d_fwd_ndhwc_f16_instances
(
void
add_device_pool3d_fwd_ndhwc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F16
,
F16
,
I32
,
ReduceOpId
,
false
>>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F16
,
F16
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
false
>>>&
instances
)
{
{
add_device_operation_instances
(
add_device_operation_instances
(
instances
,
device_pool3d_fwd_ndhwc_instances
<
F16
,
F16
,
I32
,
F16
,
ReduceOpId
,
false
>
{});
instances
,
device_pool3d_fwd_ndhwc_instances
<
F16
,
F16
,
I32
,
F16
,
ReduceOpId
,
false
>
{});
}
}
void
add_device_pool3d_fwd_ndhwc_index_f16_instances
(
void
add_device_pool3d_fwd_ndhwc_index_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F16
,
F16
,
I32
,
ReduceOpId
,
true
>>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F16
,
F16
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
true
>>>&
instances
)
{
{
add_device_operation_instances
(
add_device_operation_instances
(
instances
,
device_pool3d_fwd_ndhwc_instances
<
F16
,
F16
,
I32
,
F16
,
ReduceOpId
,
true
>
{});
instances
,
device_pool3d_fwd_ndhwc_instances
<
F16
,
F16
,
I32
,
F16
,
ReduceOpId
,
true
>
{});
...
...
library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
→
library/src/tensor_operation_instance/gpu/pool
3d
_fwd/device_max_pool3d_fwd_ndhwc_f32_instance.cpp
View file @
4100d1d8
...
@@ -11,14 +11,18 @@ namespace instance {
...
@@ -11,14 +11,18 @@ namespace instance {
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
MAX
;
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
MAX
;
void
add_device_pool3d_fwd_ndhwc_f32_instances
(
void
add_device_pool3d_fwd_ndhwc_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F32
,
F32
,
I32
,
ReduceOpId
,
false
>>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F32
,
F32
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
false
>>>&
instances
)
{
{
add_device_operation_instances
(
add_device_operation_instances
(
instances
,
device_pool3d_fwd_ndhwc_instances
<
F32
,
F32
,
I32
,
F32
,
ReduceOpId
,
false
>
{});
instances
,
device_pool3d_fwd_ndhwc_instances
<
F32
,
F32
,
I32
,
F32
,
ReduceOpId
,
false
>
{});
}
}
void
add_device_pool3d_fwd_ndhwc_index_f32_instances
(
void
add_device_pool3d_fwd_ndhwc_index_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F32
,
F32
,
I32
,
ReduceOpId
,
true
>>>&
instances
)
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
5
,
3
,
F32
,
F32
,
I32
,
NDHWC
,
NDHWC
,
ReduceOpId
,
true
>>>&
instances
)
{
{
add_device_operation_instances
(
add_device_operation_instances
(
instances
,
device_pool3d_fwd_ndhwc_instances
<
F32
,
F32
,
I32
,
F32
,
ReduceOpId
,
true
>
{});
instances
,
device_pool3d_fwd_ndhwc_instances
<
F32
,
F32
,
I32
,
F32
,
ReduceOpId
,
true
>
{});
...
...
library/src/tensor_operation_instance/gpu/pool_fwd/pool_fwd_instance_common.hpp
→
library/src/tensor_operation_instance/gpu/pool
3d
_fwd/pool_fwd_instance_common.hpp
View file @
4100d1d8
...
@@ -15,24 +15,10 @@ namespace tensor_operation {
...
@@ -15,24 +15,10 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
using
I32
=
int32_t
;
using
I32
=
int32_t
;
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
using
NDHWC
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
template
<
typename
InDataType
,
typename
OutDataType
,
typename
IndexDataType
,
typename
ComputeDataType
,
ReduceTensorOp
ReduceOpId
,
bool
OutputIndex
>
using
device_pool2d_fwd_nhwc_instances
=
// clang-format off
std
::
tuple
<
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
256
,
256
,
1
,
1
,
1
,
1
>
,
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
256
,
256
,
1
,
2
,
1
,
2
>
,
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
256
,
256
,
1
,
4
,
1
,
4
>
// clang-format on
>
;
template
<
typename
InDataType
,
template
<
typename
InDataType
,
typename
OutDataType
,
typename
OutDataType
,
...
@@ -43,9 +29,9 @@ template <typename InDataType,
...
@@ -43,9 +29,9 @@ template <typename InDataType,
using
device_pool3d_fwd_ndhwc_instances
=
using
device_pool3d_fwd_ndhwc_instances
=
// clang-format off
// clang-format off
std
::
tuple
<
std
::
tuple
<
DevicePool3dFwd_
Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_
C
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
256
,
256
,
1
,
1
,
1
,
1
>
,
DevicePool3dFwd_
NDHWC_NDHW
C
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
256
,
256
,
1
,
1
,
1
,
1
>
,
DevicePool3dFwd_
Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_
C
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
256
,
256
,
1
,
2
,
1
,
2
>
,
DevicePool3dFwd_
NDHWC_NDHW
C
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
256
,
256
,
1
,
2
,
1
,
2
>
,
DevicePool3dFwd_
Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_
C
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
256
,
256
,
1
,
4
,
1
,
4
>
DevicePool3dFwd_
NDHWC_NDHW
C
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
256
,
256
,
1
,
4
,
1
,
4
>
// clang-format on
// clang-format on
>
;
>
;
...
...
library/src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt
deleted
100644 → 0
View file @
48717006
# Instance library for the forward pooling kernels: average and max pooling,
# 2D (NHWC) and 3D (NDHWC) variants, each built for f16 and f32.
add_instance_library(device_pool_fwd_instance
    device_avg_pool2d_fwd_nhwc_f16_instance.cpp
    device_avg_pool2d_fwd_nhwc_f32_instance.cpp
    device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
    device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
    device_max_pool2d_fwd_nhwc_f16_instance.cpp
    device_max_pool2d_fwd_nhwc_f32_instance.cpp
    device_max_pool3d_fwd_ndhwc_f16_instance.cpp
    device_max_pool3d_fwd_ndhwc_f32_instance.cpp
)
library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Reduction used by every instance registered from this translation unit.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;

/// Registers the F16 average-pooling 2D forward instances.
///
/// @param instances  Container that receives the instances; it is handed
///                   unchanged to add_device_operation_instances together
///                   with a default-constructed instance tuple.
void add_device_pool2d_fwd_nhwc_f16_instances(
    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
{
    // F16 input/output, I32 index type, F32 compute type, no index output.
    using AvgPool2dF16Instances =
        device_pool2d_fwd_nhwc_instances<F16, F16, I32, F32, ReduceOpId, false>;

    add_device_operation_instances(instances, AvgPool2dF16Instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f32_instance.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Reduction used by every instance registered from this translation unit.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;

/// Registers the F32 average-pooling 2D forward instances.
///
/// @param instances  Container that receives the instances; it is handed
///                   unchanged to add_device_operation_instances together
///                   with a default-constructed instance tuple.
void add_device_pool2d_fwd_nhwc_f32_instances(
    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, false>>>& instances)
{
    // F32 input/output, I32 index type, F32 compute type, no index output.
    using AvgPool2dF32Instances =
        device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>;

    add_device_operation_instances(instances, AvgPool2dF32Instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f16_instance.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Reduction used by every instance registered from this translation unit.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;

/// Registers the F16 max-pooling 2D forward instances that do not output
/// indices (last template argument is `false`).
///
/// @param instances  Container that receives the instances; it is handed
///                   unchanged to add_device_operation_instances together
///                   with a default-constructed instance tuple.
void add_device_pool2d_fwd_nhwc_f16_instances(
    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, false>>>& instances)
{
    // F16 input/output, I32 index type, F16 compute type, no index output.
    using MaxPool2dF16Instances =
        device_pool2d_fwd_nhwc_instances<F16, F16, I32, F16, ReduceOpId, false>;

    add_device_operation_instances(instances, MaxPool2dF16Instances{});
}

/// Registers the F16 max-pooling 2D forward instances that also output
/// indices (last template argument is `true`).
///
/// @param instances  Container that receives the instances; it is handed
///                   unchanged to add_device_operation_instances together
///                   with a default-constructed instance tuple.
void add_device_pool2d_fwd_nhwc_index_f16_instances(
    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F16, F16, I32, ReduceOpId, true>>>& instances)
{
    // Same type configuration as above, with index output enabled.
    using MaxPool2dIndexF16Instances =
        device_pool2d_fwd_nhwc_instances<F16, F16, I32, F16, ReduceOpId, true>;

    add_device_operation_instances(instances, MaxPool2dIndexF16Instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/pool_fwd/device_max_pool2d_fwd_nhwc_f32_instance.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "pool_fwd_instance_common.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Reduction used by every instance registered from this translation unit.
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;

/// Registers the F32 max-pooling 2D forward instances that do not output
/// indices (last template argument is `false`).
///
/// @param instances  Container that receives the instances; it is handed
///                   unchanged to add_device_operation_instances together
///                   with a default-constructed instance tuple.
void add_device_pool2d_fwd_nhwc_f32_instances(
    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, false>>>& instances)
{
    // F32 input/output, I32 index type, F32 compute type, no index output.
    using MaxPool2dF32Instances =
        device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, false>;

    add_device_operation_instances(instances, MaxPool2dF32Instances{});
}

/// Registers the F32 max-pooling 2D forward instances that also output
/// indices (last template argument is `true`).
///
/// @param instances  Container that receives the instances; it is handed
///                   unchanged to add_device_operation_instances together
///                   with a default-constructed instance tuple.
void add_device_pool2d_fwd_nhwc_index_f32_instances(
    std::vector<std::unique_ptr<DevicePoolFwd<4, 2, F32, F32, I32, ReduceOpId, true>>>& instances)
{
    // Same type configuration as above, with index output enabled.
    using MaxPool2dIndexF32Instances =
        device_pool2d_fwd_nhwc_instances<F32, F32, I32, F32, ReduceOpId, true>;

    add_device_operation_instances(instances, MaxPool2dIndexF32Instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
View file @
4100d1d8
set
(
CONV2D_PERLAYER_QUANT_SRC
if
(
DTYPES MATCHES
"int8"
OR NOT DEFINED DTYPES
)
conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
)
set
(
CONV2D_PERCHANNEL_QUANT_SRC
conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
)
set
(
CONV2D_BIAS_PERLAYER_QUANT_SRC
conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
)
set
(
CONV2D_BIAS_PERCHANNEL_QUANT_SRC
conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
)
set
(
CONV2D_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
)
set
(
CONV2D_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
)
set
(
CONV2D_BIAS_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
)
set
(
CONV2D_BIAS_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
)
set
(
GEMM_QUANT_SRC
set
(
GEMM_QUANT_SRC
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
)
)
if
(
DL_KERNELS
)
list
(
APPEND CONV2D_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
)
list
(
APPEND CONV2D_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
)
list
(
APPEND CONV2D_BIAS_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
)
list
(
APPEND CONV2D_BIAS_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
)
list
(
APPEND GEMM_QUANT_SRC
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
)
endif
()
add_instance_library
(
device_quantization_instance
add_instance_library
(
device_quantization_instance
${
CONV2D_PERLAYER_QUANT_SRC
}
${
CONV2D_PERLAYER_QUANT_SRC
}
...
@@ -36,3 +29,4 @@ add_instance_library(device_quantization_instance
...
@@ -36,3 +29,4 @@ add_instance_library(device_quantization_instance
${
CONV2D_BIAS_PERCHANNEL_QUANT_SRC
}
${
CONV2D_BIAS_PERCHANNEL_QUANT_SRC
}
${
GEMM_QUANT_SRC
}
${
GEMM_QUANT_SRC
}
)
)
endif
()
\ No newline at end of file
library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_int8_instance.hpp
View file @
4100d1d8
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
#pragma once
#pragma once
#include "conv2d_quantization_common.hpp"
#include "conv2d_quantization_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
...
library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
View file @
4100d1d8
add_instance_library
(
device_softmax_instance
set
(
DEVICE_SOFTMAX_INSTANCES
)
device_softmax_i8_i8_instance.cpp
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
device_softmax_i8_i8_instance_rank3_reduce1.cpp
list
(
APPEND DEVICE_SOFTMAX_INSTANCES device_softmax_f16_f16_instance_rank3_reduce1.cpp
device_softmax_i8_i8_instance_rank3_reduce2.cpp
device_softmax_i8_i8_instance_rank3_reduce3.cpp
device_softmax_i8_i8_instance_rank4_reduce1.cpp
device_softmax_i8_i8_instance_rank4_reduce2.cpp
device_softmax_i8_i8_instance_rank4_reduce3.cpp
device_softmax_i8_i8_instance_rank4_reduce4.cpp
device_softmax_f16_f16_instance.cpp
device_softmax_f16_f16_instance_rank3_reduce1.cpp
device_softmax_f16_f16_instance_rank3_reduce2.cpp
device_softmax_f16_f16_instance_rank3_reduce2.cpp
device_softmax_f16_f16_instance_rank3_reduce3.cpp
device_softmax_f16_f16_instance_rank3_reduce3.cpp
device_softmax_f16_f16_instance_rank4_reduce1.cpp
device_softmax_f16_f16_instance_rank4_reduce1.cpp
device_softmax_f16_f16_instance_rank4_reduce2.cpp
device_softmax_f16_f16_instance_rank4_reduce2.cpp
device_softmax_f16_f16_instance_rank4_reduce3.cpp
device_softmax_f16_f16_instance_rank4_reduce3.cpp
device_softmax_f16_f16_instance_rank4_reduce4.cpp
device_softmax_f16_f16_instance_rank4_reduce4.cpp
)
device_softmax_f32_f32_instance.cpp
endif
()
device_softmax_f32_f32_instance_rank3_reduce1.cpp
if
(
DTYPES MATCHES
"fp32"
OR NOT DEFINED DTYPES
)
list
(
APPEND DEVICE_SOFTMAX_INSTANCES device_softmax_f32_f32_instance_rank3_reduce1.cpp
device_softmax_f32_f32_instance_rank3_reduce2.cpp
device_softmax_f32_f32_instance_rank3_reduce2.cpp
device_softmax_f32_f32_instance_rank3_reduce3.cpp
device_softmax_f32_f32_instance_rank3_reduce3.cpp
device_softmax_f32_f32_instance_rank4_reduce1.cpp
device_softmax_f32_f32_instance_rank4_reduce1.cpp
device_softmax_f32_f32_instance_rank4_reduce2.cpp
device_softmax_f32_f32_instance_rank4_reduce2.cpp
device_softmax_f32_f32_instance_rank4_reduce3.cpp
device_softmax_f32_f32_instance_rank4_reduce3.cpp
device_softmax_f32_f32_instance_rank4_reduce4.cpp
device_softmax_f32_f32_instance_rank4_reduce4.cpp
)
)
endif
()
add_instance_library
(
device_softmax_instance
${
DEVICE_SOFTMAX_INSTANCES
}
)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp
deleted
100644 → 0
View file @
48717006
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

/// Collects all rank-3 F16->F16 softmax instances into `instances`.
///
/// Forwards to the per-reduce registration functions in a fixed order
/// (reduce1, reduce2, reduce3), so the resulting element order follows
/// that sequence.
///
/// @param instances  Container passed on to each per-reduce function.
void add_device_softmax_f16_f16_rank3_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
{
    add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
    add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
    add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
}

/// Collects all rank-4 F16->F16 softmax instances into `instances`.
///
/// Forwards to the per-reduce registration functions in a fixed order
/// (reduce1 through reduce4), so the resulting element order follows
/// that sequence.
///
/// @param instances  Container passed on to each per-reduce function.
void add_device_softmax_f16_f16_rank4_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances)
{
    add_device_softmax_f16_f16_rank4_reduce1_instances(instances);
    add_device_softmax_f16_f16_rank4_reduce2_instances(instances);
    add_device_softmax_f16_f16_rank4_reduce3_instances(instances);
    add_device_softmax_f16_f16_rank4_reduce4_instances(instances);
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp
View file @
4100d1d8
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f16_f16_rank3_reduce1_instances
(
void
add_device_softmax_f16_f16_rank3_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
,
1
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
3
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
3
,
1
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp
View file @
4100d1d8
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f16_f16_rank3_reduce2_instances
(
void
add_device_softmax_f16_f16_rank3_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
,
2
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
3
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
3
,
2
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp
View file @
4100d1d8
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
3
;
void
add_device_softmax_f16_f16_rank3_reduce3_instances
(
void
add_device_softmax_f16_f16_rank3_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
,
3
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
3
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
3
,
3
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp
View file @
4100d1d8
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce1_instances
(
void
add_device_softmax_f16_f16_rank4_reduce1_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
,
1
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
4
,
1
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
1
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp
View file @
4100d1d8
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce2_instances
(
void
add_device_softmax_f16_f16_rank4_reduce2_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
,
2
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
4
,
2
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
2
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp
View file @
4100d1d8
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
...
@@ -13,12 +13,11 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
static
constexpr
index_t
RANK
=
4
;
void
add_device_softmax_f16_f16_rank4_reduce3_instances
(
void
add_device_softmax_f16_f16_rank4_reduce3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
RANK
>>&
instances
)
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
,
3
>>&
instances
)
{
{
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
RANK
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_generic_instance
<
4
,
3
>
{});
add_device_operation_instances
(
instances
,
device_softmax_f16_f16_instances
<
4
,
3
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
Prev
1
…
22
23
24
25
26
27
28
29
30
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment