Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
5903efe7
Unverified
Commit
5903efe7
authored
Nov 16, 2023
by
arai713
Committed by
GitHub
Nov 16, 2023
Browse files
Merge branch 'develop' into transpose_5d
parents
2100ea4b
e1fa0091
Changes
231
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
176 additions
and
110 deletions
+176
-110
client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_bf16.cpp
...vnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_bf16.cpp
+13
-0
client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp16.cpp
...vnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp16.cpp
+13
-0
client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp32.cpp
...vnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp32.cpp
+13
-0
client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_int8.cpp
...vnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_int8.cpp
+13
-0
docs/sphinx/requirements.txt
docs/sphinx/requirements.txt
+16
-2
example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
+2
-2
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
+2
-2
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
+2
-2
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
+2
-2
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+2
-2
example/27_layernorm/CMakeLists.txt
example/27_layernorm/CMakeLists.txt
+0
-2
example/27_layernorm/layernorm_fp16.cpp
example/27_layernorm/layernorm_fp16.cpp
+0
-44
example/27_layernorm/layernorm_splitk_fp16.cpp
example/27_layernorm/layernorm_splitk_fp16.cpp
+0
-45
example/27_layernorm2d_fwd/CMakeLists.txt
example/27_layernorm2d_fwd/CMakeLists.txt
+2
-0
example/27_layernorm2d_fwd/common.hpp
example/27_layernorm2d_fwd/common.hpp
+2
-2
example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp
example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp
+44
-0
example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp
example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp
+45
-0
example/27_layernorm2d_fwd/run_layernorm_example.inc
example/27_layernorm2d_fwd/run_layernorm_example.inc
+3
-3
example/30_grouped_conv_fwd_multiple_d/README.md
example/30_grouped_conv_fwd_multiple_d/README.md
+1
-1
example/30_grouped_conv_fwd_multiple_d/common.hpp
example/30_grouped_conv_fwd_multiple_d/common.hpp
+1
-1
No files found.
client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_bf16.cpp
0 → 100644
View file @
5903efe7
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using
InDataType
=
ck
::
Tuple
<
ck
::
bhalf_t
,
ck
::
bhalf_t
>
;
using
WeiDataType
=
ck
::
Tuple
<
ck
::
bhalf_t
,
ck
::
bhalf_t
>
;
using
OutDataType
=
ck
::
bhalf_t
;
#include "grouped_conv_fwd_scaleadd_ab.inc"
int
main
()
{
return
execute_conv_fwd_scaleadd_ab
();
}
client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp16.cpp
0 → 100644
View file @
5903efe7
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using
InDataType
=
ck
::
Tuple
<
ck
::
half_t
,
ck
::
half_t
>
;
using
WeiDataType
=
ck
::
Tuple
<
ck
::
half_t
,
ck
::
half_t
>
;
using
OutDataType
=
ck
::
half_t
;
#include "grouped_conv_fwd_scaleadd_ab.inc"
int
main
()
{
return
execute_conv_fwd_scaleadd_ab
();
}
client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp32.cpp
0 → 100644
View file @
5903efe7
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using
InDataType
=
ck
::
Tuple
<
float
,
float
>
;
using
WeiDataType
=
ck
::
Tuple
<
float
,
float
>
;
using
OutDataType
=
float
;
#include "grouped_conv_fwd_scaleadd_ab.inc"
int
main
()
{
return
execute_conv_fwd_scaleadd_ab
();
}
client_example/24_grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_int8.cpp
0 → 100644
View file @
5903efe7
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using
InDataType
=
ck
::
Tuple
<
int8_t
,
int8_t
>
;
using
WeiDataType
=
ck
::
Tuple
<
int8_t
,
int8_t
>
;
using
OutDataType
=
int8_t
;
#include "grouped_conv_fwd_scaleadd_ab.inc"
int
main
()
{
return
execute_conv_fwd_scaleadd_ab
();
}
docs/sphinx/requirements.txt
View file @
5903efe7
...
@@ -48,6 +48,12 @@ idna==3.4
...
@@ -48,6 +48,12 @@ idna==3.4
# via requests
# via requests
imagesize==1.4.1
imagesize==1.4.1
# via sphinx
# via sphinx
importlib-metadata==6.8.0
# via
# sphinx
# sphinxcontrib-bibtex
importlib-resources==6.1.0
# via rocm-docs-core
jinja2==3.1.2
jinja2==3.1.2
# via
# via
# myst-parser
# myst-parser
...
@@ -90,9 +96,13 @@ pygments==2.14.0
...
@@ -90,9 +96,13 @@ pygments==2.14.0
# pydata-sphinx-theme
# pydata-sphinx-theme
# sphinx
# sphinx
pyjwt[crypto]==2.6.0
pyjwt[crypto]==2.6.0
# via pygithub
# via
# pygithub
# pyjwt
pynacl==1.5.0
pynacl==1.5.0
# via pygithub
# via pygithub
pytz==2023.3.post1
# via babel
pyyaml==6.0
pyyaml==6.0
# via
# via
# myst-parser
# myst-parser
...
@@ -103,7 +113,7 @@ requests==2.28.2
...
@@ -103,7 +113,7 @@ requests==2.28.2
# via
# via
# pygithub
# pygithub
# sphinx
# sphinx
rocm-docs-core==0.2
6
.0
rocm-docs-core==0.2
7
.0
# via -r requirements.in
# via -r requirements.in
six==1.16.0
six==1.16.0
# via
# via
...
@@ -157,3 +167,7 @@ urllib3==1.26.15
...
@@ -157,3 +167,7 @@ urllib3==1.26.15
# via requests
# via requests
wrapt==1.15.0
wrapt==1.15.0
# via deprecated
# via deprecated
zipp==3.17.0
# via
# importlib-metadata
# importlib-resources
example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp
View file @
5903efe7
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp"
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
...
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InLayout
,
InLayout
,
WeiLayout
,
WeiLayout
,
...
...
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp
View file @
5903efe7
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp"
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
...
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InLayout
,
InLayout
,
WeiLayout
,
WeiLayout
,
...
...
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp
View file @
5903efe7
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp"
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
...
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InLayout
,
InLayout
,
WeiLayout
,
WeiLayout
,
...
...
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp
View file @
5903efe7
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp"
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
...
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InLayout
,
InLayout
,
WeiLayout
,
WeiLayout
,
...
...
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
View file @
5903efe7
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "convnd_fwd_common.hpp"
#include "convnd_fwd_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
...
@@ -27,7 +27,7 @@ static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecializatio
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
using
DeviceGroupedConvNDFwdInstance
=
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle
<
NDimSpatial
,
NDimSpatial
,
InLayout
,
InLayout
,
WeiLayout
,
WeiLayout
,
...
...
example/27_layernorm/CMakeLists.txt
deleted
100644 → 0
View file @
2100ea4b
add_example_executable
(
example_layernorm_fp16 layernorm_fp16.cpp
)
add_example_executable
(
example_layernorm_splitk_fp16 layernorm_splitk_fp16.cpp
)
example/27_layernorm/layernorm_fp16.cpp
deleted
100644 → 0
View file @
2100ea4b
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
SaveMeanInvStdDataType
=
float
;
using
ComputeDataType
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
#define SAVE_MEAN_INV_STD
constexpr
int
Rank
=
2
;
constexpr
int
NumReduceDim
=
1
;
using
DeviceInstance
=
ck
::
tensor_operation
::
device
::
DeviceNormalizationImpl
<
XDataType
,
GammaDataType
,
BetaDataType
,
ComputeDataType
,
YDataType
,
SaveMeanInvStdDataType
,
PassThrough
,
Rank
,
NumReduceDim
,
256
,
// BlockSize
8
,
// ClusterM
32
,
// ClusterK
1
,
// SliceM
8
,
// SliceK
1
,
// XYVectorDim (0=M, 1=K)
8
,
// SrcScalarPerVector
1
,
// GammaVecDim (0=M, 1=K)
8
,
// GammaScalarPerVector
1
,
// BetaVecDim (0=M, 1=K)
8
,
// BetaScalarPerVector
8
,
// YScalarPerVector
1
>
;
// SaveMeanInvStdScalarPerVector
#include "run_layernorm_example.inc"
int
main
()
{
return
run_groupnorm_example
<
DeviceInstance
>
();
}
example/27_layernorm/layernorm_splitk_fp16.cpp
deleted
100644 → 0
View file @
2100ea4b
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
SaveMeanInvStdDataType
=
float
;
using
ComputeDataType
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
#define SAVE_MEAN_INV_STD
constexpr
int
Rank
=
2
;
constexpr
int
NumReduceDim
=
1
;
using
DeviceInstance
=
ck
::
tensor_operation
::
device
::
DeviceNormalizationSplitKImpl
<
XDataType
,
GammaDataType
,
BetaDataType
,
ComputeDataType
,
YDataType
,
SaveMeanInvStdDataType
,
PassThrough
,
Rank
,
NumReduceDim
,
256
,
// BlockSize
8
,
// ClusterM
32
,
// ClusterK
1
,
// SliceM
8
,
// SliceK
1
,
// XYVectorDim (0=M, 1=K)
8
,
// XScalarPerVector
1
,
// GammaVecDim (0=M, 1=K)
8
,
// GammaScalarPerVector
1
,
// BetaVecDim (0=M, 1=K)
8
,
// BetaScalarPerVector
8
,
// YScalarPerVector
1
>
;
// SaveMeanInvStdScalarPerVector
#include "run_layernorm_example.inc"
int
main
()
{
return
run_groupnorm_example
<
DeviceInstance
>
();
}
example/27_layernorm2d_fwd/CMakeLists.txt
0 → 100644
View file @
5903efe7
add_example_executable
(
example_layernorm2d_fwd_fp16 layernorm2d_fwd_fp16.cpp
)
add_example_executable
(
example_layernorm2d_fwd_splitk_fp16 layernorm2d_fwd_splitk_fp16.cpp
)
example/27_layernorm/common.hpp
→
example/27_layernorm
2d_fwd
/common.hpp
View file @
5903efe7
...
@@ -10,8 +10,8 @@
...
@@ -10,8 +10,8 @@
#include <getopt.h>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_
fwd_
impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_
fwd_
splitk_impl.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
...
example/27_layernorm2d_fwd/layernorm2d_fwd_fp16.cpp
0 → 100644
View file @
5903efe7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
SaveMeanInvStdDataType
=
float
;
using
ComputeDataType
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
#define SAVE_MEAN_INV_STD
constexpr
int
Rank
=
2
;
constexpr
int
NumReduceDim
=
1
;
using
DeviceInstance
=
ck
::
tensor_operation
::
device
::
DeviceNormalizationFwdImpl
<
XDataType
,
GammaDataType
,
BetaDataType
,
ComputeDataType
,
YDataType
,
SaveMeanInvStdDataType
,
PassThrough
,
Rank
,
NumReduceDim
,
256
,
// BlockSize
8
,
// ClusterM
32
,
// ClusterK
1
,
// SliceM
8
,
// SliceK
1
,
// XYVectorDim (0=M, 1=K)
8
,
// SrcScalarPerVector
1
,
// GammaVecDim (0=M, 1=K)
8
,
// GammaScalarPerVector
1
,
// BetaVecDim (0=M, 1=K)
8
,
// BetaScalarPerVector
8
,
// YScalarPerVector
1
>
;
// SaveMeanInvStdScalarPerVector
#include "run_layernorm_example.inc"
int
main
()
{
return
run_layernorm2d_fwd_example
<
DeviceInstance
>
();
}
example/27_layernorm2d_fwd/layernorm2d_fwd_splitk_fp16.cpp
0 → 100644
View file @
5903efe7
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
SaveMeanInvStdDataType
=
float
;
using
ComputeDataType
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
#define SAVE_MEAN_INV_STD
constexpr
int
Rank
=
2
;
constexpr
int
NumReduceDim
=
1
;
using
DeviceInstance
=
ck
::
tensor_operation
::
device
::
DeviceNormalizationFwdSplitKImpl
<
XDataType
,
GammaDataType
,
BetaDataType
,
ComputeDataType
,
YDataType
,
SaveMeanInvStdDataType
,
PassThrough
,
Rank
,
NumReduceDim
,
256
,
// BlockSize
8
,
// ClusterM
32
,
// ClusterK
1
,
// SliceM
8
,
// SliceK
1
,
// XYVectorDim (0=M, 1=K)
8
,
// XScalarPerVector
1
,
// GammaVecDim (0=M, 1=K)
8
,
// GammaScalarPerVector
1
,
// BetaVecDim (0=M, 1=K)
8
,
// BetaScalarPerVector
8
,
// YScalarPerVector
1
>
;
// SaveMeanInvStdScalarPerVector
#include "run_layernorm_example.inc"
int
main
()
{
return
run_layernorm2d_fwd_example
<
DeviceInstance
>
();
}
example/27_layernorm/run_layernorm_example.inc
→
example/27_layernorm
2d_fwd
/run_layernorm_example.inc
View file @
5903efe7
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
#pragma once
#pragma once
template
<
typename
DeviceInstance
>
template
<
typename
DeviceInstance
>
int
run_
groupnorm
_example
()
int
run_
layernorm2d_fwd
_example
()
{
{
bool
time_kernel
=
false
;
bool
time_kernel
=
false
;
...
@@ -44,9 +44,9 @@ int run_groupnorm_example()
...
@@ -44,9 +44,9 @@ int run_groupnorm_example()
{
0
,
1
},
{
0
,
1
},
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
()
.
begin
(),
y
.
mDesc
.
GetStrides
()
.
end
()},
std
::
vector
<
ck
::
index_t
>
{
y
.
mDesc
.
GetStrides
()
.
begin
(),
y
.
mDesc
.
GetStrides
()
.
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
std
::
vector
<
ck
::
index_t
>
{
save_mean
.
mDesc
.
GetStrides
()
.
begin
(),
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
save_mean
.
mDesc
.
GetStrides
()
.
end
()},
{
1
},
{
1
},
1
e
-
4
,
1
e
-
4
,
x_dev
.
GetDeviceBuffer
(),
x_dev
.
GetDeviceBuffer
(),
...
...
example/30_grouped_conv_fwd_multiple_d/README.md
View file @
5903efe7
...
@@ -26,5 +26,5 @@ out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
...
@@ -26,5 +26,5 @@ out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Warm up 1 time
Start running 10 times...
Start running 10 times...
Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 16, Default>
Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultiple
AB
D_Xdl_CShuffle<256, 128, 256, 16, Default>
```
```
example/30_grouped_conv_fwd_multiple_d/common.hpp
View file @
5903efe7
...
@@ -12,7 +12,7 @@
...
@@ -12,7 +12,7 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_
ab
d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
...
...
Prev
1
2
3
4
5
6
…
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment