Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
261d3267
Commit
261d3267
authored
Nov 14, 2023
by
Bartlomiej Wroblewski
Browse files
Merge remote-tracking branch 'origin/develop' into bwroblew/direct_loads
parents
2d5b22fe
f2398f61
Changes
372
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
364 additions
and
68 deletions
+364
-68
example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp
example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp
+45
-0
example/27_layernorm2d_fwd/run_layernorm_example.inc
example/27_layernorm2d_fwd/run_layernorm_example.inc
+3
-3
example/30_grouped_conv_fwd_multiple_d/README.md
example/30_grouped_conv_fwd_multiple_d/README.md
+1
-1
example/30_grouped_conv_fwd_multiple_d/common.hpp
example/30_grouped_conv_fwd_multiple_d/common.hpp
+1
-1
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
...multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
+1
-1
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
...uped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
+1
-1
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
...conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
+2
-2
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
...n/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
+2
-2
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
...ntization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
+2
-2
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
...uantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
+2
-2
example/42_groupnorm/CMakeLists.txt
example/42_groupnorm/CMakeLists.txt
+0
-3
example/42_groupnorm/groupnorm_swish_fp16.cpp
example/42_groupnorm/groupnorm_swish_fp16.cpp
+0
-45
example/42_groupnorm_fwd/CMakeLists.txt
example/42_groupnorm_fwd/CMakeLists.txt
+3
-0
example/42_groupnorm_fwd/common.hpp
example/42_groupnorm_fwd/common.hpp
+2
-2
example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp
example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp
+65
-0
example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp
example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp
+45
-0
example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp
example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp
+45
-0
example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
+3
-3
example/44_elementwise_permute/CMakeLists.txt
example/44_elementwise_permute/CMakeLists.txt
+6
-0
example/44_elementwise_permute/elementwise_permute.cpp
example/44_elementwise_permute/elementwise_permute.cpp
+135
-0
No files found.
example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp
0 → 100644
View file @
261d3267
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// Compile-time configuration for this example. Rank and NumReduceDim are
// forwarded as template arguments to DeviceInstance below.
constexpr int Rank         = 2;
constexpr int NumReduceDim = 1;

// Tensor element types: x, gamma, beta and y all use ck::half_t.
using XDataType     = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType  = ck::half_t;
using YDataType     = ck::half_t;

// The compute type and the saved mean / inverse-std type are float.
using ComputeDataType        = float;
using SaveMeanInvStdDataType = float;

// Element-wise operation passed to the device instance.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Defined ahead of the run_layernorm_example.inc include.
// NOTE(review): presumably enables the save-mean/inv-std path in that file - confirm.
#define SAVE_MEAN_INV_STD
// Device operation used by this example: the split-K forward normalization
// implementation, instantiated with the data types, element-wise op and
// rank constants declared above. The trailing integers are tuning
// parameters; each is labelled with the name of the parameter it fills.
using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdSplitKImpl<
    XDataType,
    GammaDataType,
    BetaDataType,
    ComputeDataType,
    YDataType,
    SaveMeanInvStdDataType,
    PassThrough,
    Rank,
    NumReduceDim,
    256, // BlockSize
    8,   // ClusterM
    32,  // ClusterK
    1,   // SliceM
    8,   // SliceK
    1,   // XYVectorDim (0=M, 1=K)
    8,   // XScalarPerVector
    1,   // GammaVecDim (0=M, 1=K)
    8,   // GammaScalarPerVector
    1,   // BetaVecDim (0=M, 1=K)
    8,   // BetaScalarPerVector
    8,   // YScalarPerVector
    1>;  // SaveMeanInvStdScalarPerVector
#include "run_layernorm_example.inc"
// Entry point: runs the layernorm 2D forward example for DeviceInstance and
// uses the int it returns as the process exit status.
int main()
{
    const int exit_code = run_layernorm2d_fwd_example<DeviceInstance>();
    return exit_code;
}
example/27_layernorm/run_layernorm_example.inc
→
example/27_layernorm
2d_fwd
/run_layernorm_example.inc
View file @
261d3267
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
#pragma once
#pragma once
template
<
typename
DeviceInstance
>
template
<
typename
DeviceInstance
>
int
run_
groupnorm
_example
()
int
run_
layernorm2d_fwd
_example
()
{
{
bool
time_kernel
=
false
;
bool
time_kernel
=
false
;
...
@@ -44,9 +44,9 @@ int run_groupnorm_example()
...
@@ -44,9 +44,9 @@ int run_groupnorm_example()
{
0
,
1
},
{
0
,
1
},
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
()
.
begin
(),
y
.
mDesc
.
GetStrides
()
.
end
()},
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
()
.
begin
(),
y
.
mDesc
.
GetStrides
()
.
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
{
1
},
{
1
},
1
e
-
4
,
1
e
-
4
,
x_dev
.
GetDeviceBuffer
(),
x_dev
.
GetDeviceBuffer
(),
...
...
example/30_grouped_conv_fwd_multiple_d/README.md
View file @
261d3267
...
@@ -26,5 +26,5 @@ out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
...
@@ -26,5 +26,5 @@ out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Warm up 1 time
Start running 10 times...
Start running 10 times...
Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 16, Default>
Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle<256, 128, 256, 16, Default>
```
```
example/30_grouped_conv_fwd_multiple_d/common.hpp
View file @
261d3267
...
@@ -12,7 +12,7 @@
...
@@ -12,7 +12,7 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
...
...
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc
View file @
261d3267
...
@@ -34,7 +34,7 @@ using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayo
...
@@ -34,7 +34,7 @@ using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayo
template
<
ck
::
index_t
NDimSpatial
>
template
<
ck
::
index_t
NDimSpatial
>
using
DeviceConvFwdInstance
=
using
DeviceConvFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InputLayout
<
NDimSpatial
>
,
InputLayout
<
NDimSpatial
>
,
WeightLayout
<
NDimSpatial
>
,
WeightLayout
<
NDimSpatial
>
,
...
...
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
View file @
261d3267
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
template
<
ck
::
index_t
NDimSpatial
>
template
<
ck
::
index_t
NDimSpatial
>
using
DeviceConvFwdInstance
=
using
DeviceConvFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InputLayout
<
NDimSpatial
>
,
InputLayout
<
NDimSpatial
>
,
WeightLayout
<
NDimSpatial
>
,
WeightLayout
<
NDimSpatial
>
,
...
...
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp
View file @
261d3267
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
using
InDataType
=
int8_t
;
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
...
@@ -33,7 +33,7 @@ template <ck::index_t NDimSpatial,
...
@@ -33,7 +33,7 @@ template <ck::index_t NDimSpatial,
typename
RequantScaleLayout
,
typename
RequantScaleLayout
,
typename
OutLayout
>
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InLayout
,
InLayout
,
WeiLayout
,
WeiLayout
,
...
...
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp
View file @
261d3267
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
using
InDataType
=
int8_t
;
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
...
@@ -31,7 +31,7 @@ template <ck::index_t NDimSpatial,
...
@@ -31,7 +31,7 @@ template <ck::index_t NDimSpatial,
typename
BiasLayout
,
typename
BiasLayout
,
typename
OutLayout
>
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InLayout
,
InLayout
,
WeiLayout
,
WeiLayout
,
...
...
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp
View file @
261d3267
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
using
InDataType
=
int8_t
;
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
...
@@ -31,7 +31,7 @@ template <ck::index_t NDimSpatial,
...
@@ -31,7 +31,7 @@ template <ck::index_t NDimSpatial,
typename
RequantScaleLayout
,
typename
RequantScaleLayout
,
typename
OutLayout
>
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InLayout
,
InLayout
,
WeiLayout
,
WeiLayout
,
...
...
example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp
View file @
261d3267
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
using
InDataType
=
int8_t
;
using
InDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
using
WeiDataType
=
int8_t
;
...
@@ -26,7 +26,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
...
@@ -26,7 +26,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InLayout
,
InLayout
,
WeiLayout
,
WeiLayout
,
...
...
example/42_groupnorm/CMakeLists.txt
deleted
100644 → 0
View file @
2d5b22fe
# Group-normalization fp16 examples, one target per source file, registered
# through the project's add_example_executable helper.
add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp)
add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
example/42_groupnorm/groupnorm_swish_fp16.cpp
deleted
100644 → 0
View file @
2d5b22fe
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// Tensor element types: x, gamma, beta and y all use ck::half_t.
using XDataType     = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType  = ck::half_t;
using YDataType     = ck::half_t;

// The compute type and the saved mean / inverse-std type are float.
using ComputeDataType        = float;
using SaveMeanInvStdDataType = float;

// Element-wise operation applied to y: Swish.
using YElementOp = ck::tensor_operation::element_wise::Swish;

// Rank constants forwarded as template arguments to DeviceInstance below.
constexpr int Rank         = 5;
constexpr int NumReduceDim = 3;

// Defined ahead of the run_groupnorm_example.inc include.
// NOTE(review): presumably enables the save-mean/inv-std path in that file - confirm.
#define SAVE_MEAN_INV_STD
// Device operation used by this example: DeviceNormalizationImpl instantiated
// with the data types, element-wise op and rank constants declared above.
// The trailing integers are tuning parameters; each is labelled with the
// name of the parameter it fills.
using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationImpl<
    XDataType,
    GammaDataType,
    BetaDataType,
    ComputeDataType,
    YDataType,
    SaveMeanInvStdDataType,
    YElementOp,
    Rank,
    NumReduceDim,
    1024, // BlockSize
    1,    // ClusterM
    1024, // ClusterK
    1,    // SliceM
    32,   // SliceK
    1,    // SrcVecDim (0=M, 1=K)
    2,    // SrcScalarPerVector
    1,    // GammaVecDim (0=M, 1=K)
    2,    // GammaScalarPerVector
    1,    // BetaVecDim (0=M, 1=K)
    2,    // BetaScalarPerVector
    2,    // YScalarPerVector
    1>;   // SaveMeanInvStdScalarPerVector
#include "run_groupnorm_example.inc"
// Entry point: forwards the command-line arguments to run_groupnorm_example
// and returns its int result as the process exit status.
//
// The result used to be discarded; main then fell off the end, which C++
// treats as `return 0`, so the process reported success whatever the example
// returned.
int main(int argc, char* argv[]) { return run_groupnorm_example(argc, argv); }
example/42_groupnorm_fwd/CMakeLists.txt
0 → 100644
View file @
261d3267
# Group-normalization forward fp16 examples, one target per source file,
# registered through the project's add_example_executable helper.
add_example_executable(example_groupnorm_fwd_sigmoid_mul_fp16 groupnorm_fwd_sigmoid_mul_fp16.cpp)
add_example_executable(example_groupnorm_fwd_splitk_fp16 groupnorm_fwd_splitk_fp16.cpp)
add_example_executable(example_groupnorm_fwd_swish_fp16 groupnorm_fwd_swish_fp16.cpp)
example/42_groupnorm/common.hpp
→
example/42_groupnorm
_fwd
/common.hpp
View file @
261d3267
...
@@ -11,8 +11,8 @@
...
@@ -11,8 +11,8 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_
fwd_
impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_
fwd_
splitk_impl.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/fill.hpp"
...
...
example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp
→
example/42_groupnorm
_fwd
/groupnorm_
fwd_
sigmoid_mul_fp16.cpp
View file @
261d3267
...
@@ -37,29 +37,29 @@ struct YElementOp
...
@@ -37,29 +37,29 @@ struct YElementOp
};
};
using
DeviceInstance
=
using
DeviceInstance
=
ck
::
tensor_operation
::
device
::
DeviceNormalizationImpl
<
XDataType
,
ck
::
tensor_operation
::
device
::
DeviceNormalization
Fwd
Impl
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
ComputeDataType
,
ComputeDataType
,
YDataType
,
YDataType
,
SaveMeanInvStdDataType
,
SaveMeanInvStdDataType
,
YElementOp
,
YElementOp
,
Rank
,
Rank
,
NumReduceDim
,
NumReduceDim
,
1024
,
// BlockSize
1024
,
// BlockSize
1
,
// ClusterM
1
,
// ClusterM
1024
,
// ClusterK
1024
,
// ClusterK
1
,
// SliceM
1
,
// SliceM
32
,
// SliceK
32
,
// SliceK
1
,
// SrcVecDim (0=M, 1=K)
1
,
// SrcVecDim (0=M, 1=K)
2
,
// SrcScalarPerVector
2
,
// SrcScalarPerVector
1
,
// GammaVecDim (0=M, 1=K)
1
,
// GammaVecDim (0=M, 1=K)
2
,
// GammaScalarPerVector
2
,
// GammaScalarPerVector
1
,
// BetaVecDim (0=M, 1=K)
1
,
// BetaVecDim (0=M, 1=K)
2
,
// BetaScalarPerVector
2
,
// BetaScalarPerVector
2
,
// YScalarPerVector
2
,
// YScalarPerVector
1
>
;
// SaveMeanInvStdScalarPerVector
1
>
;
// SaveMeanInvStdScalarPerVector
#include "run_groupnorm_example.inc"
#include "run_groupnorm_
fwd_
example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
run_groupnorm_example
(
argc
,
argv
);
}
int
main
(
int
argc
,
char
*
argv
[])
{
run_groupnorm_
fwd_
example
(
argc
,
argv
);
}
example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp
0 → 100644
View file @
261d3267
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// Tensor element types: x, gamma, beta and y all use ck::half_t.
using XDataType     = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType  = ck::half_t;
using YDataType     = ck::half_t;

// The compute type and the saved mean / inverse-std type are float.
using ComputeDataType        = float;
using SaveMeanInvStdDataType = float;

// Element-wise operation applied to y: Swish.
using YElementOp = ck::tensor_operation::element_wise::Swish;

// Rank constants forwarded as template arguments to DeviceInstance below.
constexpr int Rank         = 5;
constexpr int NumReduceDim = 3;

// Defined ahead of the run_groupnorm_fwd_example.inc include.
// NOTE(review): presumably enables the save-mean/inv-std path in that file - confirm.
#define SAVE_MEAN_INV_STD
// Device operation used by this example: the split-K forward normalization
// implementation, instantiated with the data types, element-wise op and
// rank constants declared above. The trailing integers are tuning
// parameters; each is labelled with the name of the parameter it fills.
using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdSplitKImpl<
    XDataType,
    GammaDataType,
    BetaDataType,
    ComputeDataType,
    YDataType,
    SaveMeanInvStdDataType,
    YElementOp,
    Rank,
    NumReduceDim,
    256, // BlockSize
    1,   // ClusterM
    256, // ClusterK
    1,   // SliceM
    16,  // SliceK
    1,   // SrcVecDim (0=M, 1=K)
    2,   // SrcScalarPerVector
    1,   // GammaVecDim (0=M, 1=K)
    2,   // GammaScalarPerVector
    1,   // BetaVecDim (0=M, 1=K)
    2,   // BetaScalarPerVector
    2,   // YScalarPerVector
    1>;  // SaveMeanInvStdScalarPerVector
#include "run_groupnorm_fwd_example.inc"
// Entry point: forwards the command-line arguments to
// run_groupnorm_fwd_example and returns its int result as the process exit
// status.
//
// The result used to be discarded; main then fell off the end, which C++
// treats as `return 0`, so the process reported success whatever the example
// returned.
int main(int argc, char* argv[]) { return run_groupnorm_fwd_example(argc, argv); }
example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp
0 → 100644
View file @
261d3267
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
// Tensor element types: x, gamma, beta and y all use ck::half_t.
using XDataType     = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType  = ck::half_t;
using YDataType     = ck::half_t;

// The compute type and the saved mean / inverse-std type are float.
using ComputeDataType        = float;
using SaveMeanInvStdDataType = float;

// Element-wise operation applied to y: Swish.
using YElementOp = ck::tensor_operation::element_wise::Swish;

// Rank constants forwarded as template arguments to DeviceInstance below.
constexpr int Rank         = 5;
constexpr int NumReduceDim = 3;

// Defined ahead of the run_groupnorm_fwd_example.inc include.
// NOTE(review): presumably enables the save-mean/inv-std path in that file - confirm.
#define SAVE_MEAN_INV_STD
// Device operation used by this example: DeviceNormalizationFwdImpl
// instantiated with the data types, element-wise op and rank constants
// declared above. The trailing integers are tuning parameters; each is
// labelled with the name of the parameter it fills.
using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdImpl<
    XDataType,
    GammaDataType,
    BetaDataType,
    ComputeDataType,
    YDataType,
    SaveMeanInvStdDataType,
    YElementOp,
    Rank,
    NumReduceDim,
    1024, // BlockSize
    1,    // ClusterM
    1024, // ClusterK
    1,    // SliceM
    32,   // SliceK
    1,    // SrcVecDim (0=M, 1=K)
    2,    // SrcScalarPerVector
    1,    // GammaVecDim (0=M, 1=K)
    2,    // GammaScalarPerVector
    1,    // BetaVecDim (0=M, 1=K)
    2,    // BetaScalarPerVector
    2,    // YScalarPerVector
    1>;   // SaveMeanInvStdScalarPerVector
#include "run_groupnorm_fwd_example.inc"
// Entry point: forwards the command-line arguments to
// run_groupnorm_fwd_example and returns its int result as the process exit
// status.
//
// The result used to be discarded; main then fell off the end, which C++
// treats as `return 0`, so the process reported success whatever the example
// returned.
int main(int argc, char* argv[]) { return run_groupnorm_fwd_example(argc, argv); }
example/42_groupnorm/run_groupnorm_example.inc
→
example/42_groupnorm
_fwd
/run_groupnorm_
fwd_
example.inc
View file @
261d3267
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#pragma once
#pragma once
int
run_groupnorm_example
(
int
argc
,
char
*
argv
[])
int
run_groupnorm_
fwd_
example
(
int
argc
,
char
*
argv
[])
{
{
ck
::
index_t
N
=
32
;
ck
::
index_t
N
=
32
;
ck
::
index_t
H
=
16
;
ck
::
index_t
H
=
16
;
...
@@ -65,9 +65,9 @@ int run_groupnorm_example(int argc, char* argv[])
...
@@ -65,9 +65,9 @@ int run_groupnorm_example(int argc, char* argv[])
{
0
,
0
,
0
,
C
,
1
},
{
0
,
0
,
0
,
C
,
1
},
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
()
.
begin
(),
y
.
mDesc
.
GetStrides
()
.
end
()},
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
()
.
begin
(),
y
.
mDesc
.
GetStrides
()
.
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
{
1
,
2
,
4
},
// reduction dimension: [H, W, C]
{
1
,
2
,
4
},
// reduction dimension: [H, W, C]
1
e
-
6
,
1
e
-
6
,
x_dev
.
GetDeviceBuffer
(),
x_dev
.
GetDeviceBuffer
(),
...
...
example/44_elementwise_permute/CMakeLists.txt
View file @
261d3267
add_example_executable
(
example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp32_row elementwise_permute_4D_fp32_row.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp16_row elementwise_permute_4D_fp16_row.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp32_col elementwise_permute_4D_fp32_col.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp
)
add_example_executable
(
example_elementwise_permute elementwise_permute.cpp
)
add_example_executable
(
example_elementwise_permute_3d elementwise_permute_3d.cpp
)
example/44_elementwise_permute/elementwise_permute.cpp
0 → 100644
View file @
261d3267
This diff is collapsed.
Click to expand it.
Prev
1
2
3
4
5
6
7
8
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment