Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
cd0c1f57
Unverified
Commit
cd0c1f57
authored
Apr 19, 2023
by
turneram
Committed by
GitHub
Apr 19, 2023
Browse files
Merge branch 'develop' into migx-device-interface
parents
c72a0d3e
bb0b772d
Changes
122
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1187 additions
and
132 deletions
+1187
-132
.github/dependabot.yml
.github/dependabot.yml
+12
-0
.gitignore
.gitignore
+8
-3
.readthedocs.yaml
.readthedocs.yaml
+18
-0
CMakeLists.txt
CMakeLists.txt
+7
-0
Dockerfile
Dockerfile
+24
-13
Jenkinsfile
Jenkinsfile
+21
-10
README.md
README.md
+12
-2
client_example/04_contraction/CMakeLists.txt
client_example/04_contraction/CMakeLists.txt
+10
-4
client_example/04_contraction/contraction_bilinear_fp32.cpp
client_example/04_contraction/contraction_bilinear_fp32.cpp
+0
-0
client_example/04_contraction/contraction_bilinear_fp64.cpp
client_example/04_contraction/contraction_bilinear_fp64.cpp
+281
-0
client_example/04_contraction/contraction_scale_fp32.cpp
client_example/04_contraction/contraction_scale_fp32.cpp
+0
-0
client_example/04_contraction/contraction_scale_fp64.cpp
client_example/04_contraction/contraction_scale_fp64.cpp
+270
-0
client_example/09_quantization/CMakeLists.txt
client_example/09_quantization/CMakeLists.txt
+6
-0
client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
...tization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
+2
-2
client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
...antization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
+51
-48
client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
...tization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
+209
-0
client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
...antization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
+201
-0
client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
...le/09_quantization/conv2d_fwd_perchannel_quantization.cpp
+2
-2
client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
...mple/09_quantization/conv2d_fwd_perlayer_quantization.cpp
+51
-48
client_example/18_groupnorm/CMakeLists.txt
client_example/18_groupnorm/CMakeLists.txt
+2
-0
No files found.
.github/dependabot.yml
0 → 100644
View file @
cd0c1f57
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version
:
2
updates
:
-
package-ecosystem
:
"
pip"
# See documentation for possible values
directory
:
"
/"
# Location of package manifests
open-pull-requests-limit
:
10
schedule
:
interval
:
"
daily"
.gitignore
View file @
cd0c1f57
...
@@ -48,6 +48,11 @@ build*
...
@@ -48,6 +48,11 @@ build*
.gdb_history
.gdb_history
install.dir*
install.dir*
# directories containing generated documentation
# documentation artifacts
docs/source/_build/
build/
docs/docBin/
_build/
_images/
_static/
_templates/
_toc.yml
docBin/
.readthedocs.yaml
0 → 100644
View file @
cd0c1f57
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version
:
2
build
:
os
:
ubuntu-22.04
tools
:
python
:
"
3.8"
sphinx
:
configuration
:
docs/conf.py
formats
:
[
htmlzip
]
python
:
install
:
-
requirements
:
docs/.sphinx/requirements.txt
CMakeLists.txt
View file @
cd0c1f57
...
@@ -22,6 +22,7 @@ include(TargetFlags)
...
@@ -22,6 +22,7 @@ include(TargetFlags)
list
(
APPEND CMAKE_PREFIX_PATH
${
CMAKE_INSTALL_PREFIX
}
${
CMAKE_INSTALL_PREFIX
}
/llvm
${
CMAKE_INSTALL_PREFIX
}
/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip
)
list
(
APPEND CMAKE_PREFIX_PATH
${
CMAKE_INSTALL_PREFIX
}
${
CMAKE_INSTALL_PREFIX
}
/llvm
${
CMAKE_INSTALL_PREFIX
}
/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip
)
# Build switch for the experimental int4 type. The option name must not carry a
# trailing comma: CMake separates arguments by whitespace, so "NAME," would
# declare a differently named option than the one tested by if() below.
option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
# Build switch for the experimental int4 type. The option name must not carry a
# trailing comma: CMake separates arguments by whitespace, so "NAME," would
# declare a differently named option than the one tested by if() below.
option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
# Build switch for the NAVI3X-specific compile flags. The option name must not
# carry a trailing comma: CMake separates arguments by whitespace, so "NAME,"
# would declare a differently named option than the one tested by if() below.
option(USE_OPT_NAVI3X "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
if
(
USE_BITINT_EXTENSION_INT4
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_compile_definitions
(
CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
)
add_compile_definitions
(
CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
)
...
@@ -29,6 +30,12 @@ if(USE_BITINT_EXTENSION_INT4)
...
@@ -29,6 +30,12 @@ if(USE_BITINT_EXTENSION_INT4)
message
(
"CK compiled with USE_BITINT_EXTENSION_INT4 set to
${
USE_BITINT_EXTENSION_INT4
}
"
)
message
(
"CK compiled with USE_BITINT_EXTENSION_INT4 set to
${
USE_BITINT_EXTENSION_INT4
}
"
)
endif
()
endif
()
if
(
USE_OPT_NAVI3X
)
add_compile_options
(
-mcumode
)
add_compile_options
(
-mno-wavefrontsize64
)
message
(
"CK compiled with USE_OPT_NAVI3X set to
${
USE_OPT_NAVI3X
}
"
)
endif
()
## Threads
## Threads
set
(
THREADS_PREFER_PTHREAD_FLAG ON
)
set
(
THREADS_PREFER_PTHREAD_FLAG ON
)
find_package
(
Threads REQUIRED
)
find_package
(
Threads REQUIRED
)
...
...
Dockerfile
View file @
cd0c1f57
FROM
ubuntu:20.04
FROM
ubuntu:20.04
ARG
ROCMVERSION=5.3
ARG
ROCMVERSION=5.
4.
3
ARG
compiler_version="release"
ARG
compiler_version="release"
ARG
compiler_commit=""
ARG
compiler_commit=""
...
@@ -8,23 +8,27 @@ RUN set -xe
...
@@ -8,23 +8,27 @@ RUN set -xe
ARG
DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
ARG
DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
RUN
useradd
-rm
-d
/home/jenkins
-s
/bin/bash
-u
1004 jenkins
RUN
useradd
-rm
-d
/home/jenkins
-s
/bin/bash
-u
1004 jenkins
RUN
useradd
-rm
-d
/home/manitera
-s
/bin/bash
-u
1002 manitera
# Add rocm repository
# Add rocm repository
RUN
apt-get update
RUN
apt-get update
RUN
apt-get
install
-y
wget gnupg
RUN
apt-get
install
-y
wget gnupg curl
RUN
wget
-qO
- http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
RUN
--mount
=
type
=
ssh
if
[
"
$ROCMVERSION
"
!=
"5.5"
]
;
then
\
wget
-qO
- http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
;
\
else
sh
-c
"wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb"
&&
\
apt update
&&
apt-get
install
-y
./amd-nonfree-radeon_20.04-1_all.deb
&&
\
sh
-c
'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.5 rel-50 > /etc/apt/sources.list.d/rocm-build.list'
&&
\
amdgpu-repo
--amdgpu-build
=
1558725
&&
DEBIAN_FRONTEND
=
noninteractive amdgpu-install
-y
--usecase
=
rocm
;
\
fi
RUN
sh
-c
"echo deb [arch=amd64]
$DEB_ROCM_REPO
ubuntu main > /etc/apt/sources.list.d/rocm.list"
RUN
sh
-c
"echo deb [arch=amd64]
$DEB_ROCM_REPO
ubuntu main > /etc/apt/sources.list.d/rocm.list"
RUN
wget
--no-check-certificate
-qO
- https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN
wget
--no-check-certificate
-qO
- https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN
sh
-c
"echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
RUN
sh
-c
"echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
RUN
curl
-fsSL
https://repo.radeon.com/rocm/rocm.gpg.key | gpg
--dearmor
-o
/etc/apt/trusted.gpg.d/rocm-keyring.gpg
# Install dependencies
# Install dependencies
RUN
apt-get update
&&
DEBIAN_FRONTEND
=
noninteractive apt-get
install
-y
--allow-unauthenticated
\
RUN
apt-get update
&&
DEBIAN_FRONTEND
=
noninteractive apt-get
install
-y
--allow-unauthenticated
\
apt-utils
\
apt-utils
\
build-essential
\
build-essential
\
ccache
\
ccache
\
cmake-data
\
cmake
\
cmake
\
curl
\
git
\
git
\
hip-rocclr
\
hip-rocclr
\
jq
\
jq
\
...
@@ -45,6 +49,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
...
@@ -45,6 +49,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
rocm-device-libs
\
rocm-device-libs
\
rocm-cmake
\
rocm-cmake
\
vim
\
vim
\
nano
\
zlib1g-dev
\
zlib1g-dev
\
openssh-server
\
openssh-server
\
clang-format-10
\
clang-format-10
\
...
@@ -52,6 +57,17 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
...
@@ -52,6 +57,17 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
apt-get clean
&&
\
apt-get clean
&&
\
rm
-rf
/var/lib/apt/lists/
*
rm
-rf
/var/lib/apt/lists/
*
#Install latest version of cmake
RUN
apt purge
--auto-remove
-y
cmake
RUN
apt update
RUN
apt
install
-y
software-properties-common lsb-release
RUN
apt clean all
RUN
wget
-O
- https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg
--dearmor
- |
tee
/etc/apt/trusted.gpg.d/kitware.gpg
>
/dev/null
RUN
apt-add-repository
"deb https://apt.kitware.com/ubuntu/
$(
lsb_release
-cs
)
main"
RUN
apt
install
-y
kitware-archive-keyring
RUN
rm
/etc/apt/trusted.gpg.d/kitware.gpg
RUN
apt
install
-y
cmake
# Setup ubsan environment to printstacktrace
# Setup ubsan environment to printstacktrace
RUN
ln
-s
/usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
RUN
ln
-s
/usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
ENV
UBSAN_OPTIONS=print_stacktrace=1
ENV
UBSAN_OPTIONS=print_stacktrace=1
...
@@ -87,12 +103,7 @@ ENV compiler_commit=$compiler_commit
...
@@ -87,12 +103,7 @@ ENV compiler_commit=$compiler_commit
RUN
sh
-c
"echo compiler version = '
$compiler_version
'"
RUN
sh
-c
"echo compiler version = '
$compiler_version
'"
RUN
sh
-c
"echo compiler commit = '
$compiler_commit
'"
RUN
sh
-c
"echo compiler commit = '
$compiler_commit
'"
RUN
--mount
=
type
=
ssh
if
[
"
$compiler_version
"
=
"amd-stg-open"
]
;
then
\
RUN
--mount
=
type
=
ssh
if
[
"
$compiler_version
"
!=
"release"
]
&&
[
"
$compiler_version
"
!=
~ ^
"rc"
]
&&
[
"
$compiler_commit
"
=
""
]
;
then
\
sed
-i
'/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);'
/opt/rocm/hip/bin/hipcc.pl
&&
\
sed
-i
'/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);'
/opt/rocm/bin/hipcc.pl
;
\
fi
RUN
--mount
=
type
=
ssh
if
[
"
$compiler_version
"
!=
"release"
]
&&
[
"
$compiler_commit
"
=
""
]
;
then
\
git clone
-b
"
$compiler_version
"
https://github.com/RadeonOpenCompute/llvm-project.git
&&
\
git clone
-b
"
$compiler_version
"
https://github.com/RadeonOpenCompute/llvm-project.git
&&
\
cd
llvm-project
&&
mkdir
build
&&
cd
build
&&
\
cd
llvm-project
&&
mkdir
build
&&
cd
build
&&
\
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/rocm/llvm
-DCMAKE_BUILD_TYPE
=
Release
-DLLVM_ENABLE_ASSERTIONS
=
1
-DLLVM_TARGETS_TO_BUILD
=
"AMDGPU;X86"
-DLLVM_ENABLE_PROJECTS
=
"clang;lld;compiler-rt"
../llvm
&&
\
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/rocm/llvm
-DCMAKE_BUILD_TYPE
=
Release
-DLLVM_ENABLE_ASSERTIONS
=
1
-DLLVM_TARGETS_TO_BUILD
=
"AMDGPU;X86"
-DLLVM_ENABLE_PROJECTS
=
"clang;lld;compiler-rt"
../llvm
&&
\
...
@@ -100,7 +111,7 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_com
...
@@ -100,7 +111,7 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_com
else
echo
"using the release compiler"
;
\
else
echo
"using the release compiler"
;
\
fi
fi
RUN
--mount
=
type
=
ssh
if
[
"
$compiler_version
"
!=
"release"
]
&&
[
"
$compiler_commit
"
!=
""
]
;
then
\
RUN
--mount
=
type
=
ssh
if
[
"
$compiler_version
"
!=
"release"
]
&&
[
"
$compiler_version
"
!=
~ ^
"rc"
]
&&
[
"
$compiler_commit
"
!=
""
]
;
then
\
git clone
-b
"
$compiler_version
"
https://github.com/RadeonOpenCompute/llvm-project.git
&&
\
git clone
-b
"
$compiler_version
"
https://github.com/RadeonOpenCompute/llvm-project.git
&&
\
cd
llvm-project
&&
git checkout
"
$compiler_commit
"
&&
echo
"checking out commit
$compiler_commit
"
&&
mkdir
build
&&
cd
build
&&
\
cd
llvm-project
&&
git checkout
"
$compiler_commit
"
&&
echo
"checking out commit
$compiler_commit
"
&&
mkdir
build
&&
cd
build
&&
\
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/rocm/llvm
-DCMAKE_BUILD_TYPE
=
Release
-DLLVM_ENABLE_ASSERTIONS
=
1
-DLLVM_TARGETS_TO_BUILD
=
"AMDGPU;X86"
-DLLVM_ENABLE_PROJECTS
=
"clang;lld;compiler-rt"
../llvm
&&
\
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/rocm/llvm
-DCMAKE_BUILD_TYPE
=
Release
-DLLVM_ENABLE_ASSERTIONS
=
1
-DLLVM_TARGETS_TO_BUILD
=
"AMDGPU;X86"
-DLLVM_ENABLE_PROJECTS
=
"clang;lld;compiler-rt"
../llvm
&&
\
...
...
Jenkinsfile
View file @
cd0c1f57
...
@@ -19,12 +19,23 @@ def runShell(String command){
...
@@ -19,12 +19,23 @@ def runShell(String command){
def
getDockerImageName
(){
def
getDockerImageName
(){
def
img
def
img
if
(
params
.
COMPILER_COMMIT
==
""
){
if
(
params
.
ROCMVERSION
!=
"5.5"
){
img
=
"${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
if
(
params
.
COMPILER_COMMIT
==
""
){
img
=
"${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
}
else
{
def
commit
=
"${params.COMPILER_COMMIT}"
[
0
..
6
]
img
=
"${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
}
}
}
else
{
else
{
def
commit
=
"${params.COMPILER_COMMIT}"
[
0
..
6
]
if
(
params
.
COMPILER_COMMIT
==
""
){
img
=
"${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
img
=
"${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
}
else
{
def
commit
=
"${params.COMPILER_COMMIT}"
[
0
..
6
]
img
=
"${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
}
}
}
return
img
return
img
}
}
...
@@ -49,11 +60,11 @@ def build_compiler(){
...
@@ -49,11 +60,11 @@ def build_compiler(){
compiler
=
'/opt/rocm/bin/hipcc'
compiler
=
'/opt/rocm/bin/hipcc'
}
}
else
{
else
{
if
(
params
.
COMPILER_VERSION
==
"
release
"
){
if
(
params
.
COMPILER_VERSION
==
"
amd-stg-open"
||
params
.
COMPILER_COMMIT
!=
"
"
){
compiler
=
"/
opt/rocm/llvm
/bin/clang++"
compiler
=
"/
llvm-project/build
/bin/clang++"
}
}
else
{
else
{
compiler
=
"/
llvm-project/build
/bin/clang++"
compiler
=
"/
opt/rocm/llvm
/bin/clang++"
}
}
}
}
return
compiler
return
compiler
...
@@ -232,7 +243,7 @@ def buildHipClangJob(Map conf=[:]){
...
@@ -232,7 +243,7 @@ def buildHipClangJob(Map conf=[:]){
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 "
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 "
}
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if
(
params
.
COMPILER_VERSION
!
=
"
release
"
){
if
(
params
.
COMPILER_VERSION
=
=
"
amd-stg-open"
||
params
.
COMPILER_COMMIT
!=
"
"
){
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
}
...
@@ -287,7 +298,7 @@ def runCKProfiler(Map conf=[:]){
...
@@ -287,7 +298,7 @@ def runCKProfiler(Map conf=[:]){
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 "
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 "
}
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if
(
params
.
COMPILER_VERSION
!
=
"
release
"
){
if
(
params
.
COMPILER_VERSION
=
=
"
amd-stg-open"
||
params
.
COMPILER_COMMIT
!=
"
"
){
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
}
...
@@ -420,7 +431,7 @@ def Build_CK(Map conf=[:]){
...
@@ -420,7 +431,7 @@ def Build_CK(Map conf=[:]){
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 "
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 "
}
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
if
(
params
.
COMPILER_VERSION
!
=
"
release
"
){
if
(
params
.
COMPILER_VERSION
=
=
"
amd-stg-open"
||
params
.
COMPILER_COMMIT
!=
"
"
){
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
}
...
...
README.md
View file @
cd0c1f57
...
@@ -7,7 +7,7 @@ CK utilizes two concepts to achieve performance portability and code maintainabi
...
@@ -7,7 +7,7 @@ CK utilizes two concepts to achieve performance portability and code maintainabi
*
A tile-based programming model
*
A tile-based programming model
*
Algorithm complexity reduction for complex ML operators, using an innovative technique we call "Tensor Coordinate Transformation".
*
Algorithm complexity reduction for complex ML operators, using an innovative technique we call "Tensor Coordinate Transformation".


## Code Structure
## Code Structure
The current CK library is structured into 4 layers:
The current CK library is structured into 4 layers:
...
@@ -16,7 +16,17 @@ Current CK library are structured into 4 layers:
...
@@ -16,7 +16,17 @@ Current CK library are structured into 4 layers:
*
"Instantiated Kernel and Invoker" layer
*
"Instantiated Kernel and Invoker" layer
*
"Client API" layer
*
"Client API" layer


## Documentation
Run the steps below to build documentation locally.
```
cd docs
pip3 install -r .sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
## Contributors
## Contributors
The list of developers and contributors is here:
[
Contributors
](
/CONTRIBUTORS.md
)
The list of developers and contributors is here:
[
Contributors
](
/CONTRIBUTORS.md
)
...
...
client_example/04_contraction/CMakeLists.txt
View file @
cd0c1f57
add_executable
(
client_contraction_scale contraction_scale.cpp
)
add_executable
(
client_contraction_scale
_fp32
contraction_scale
_fp32
.cpp
)
target_link_libraries
(
client_contraction_scale PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_contraction_scale
_fp32
PRIVATE composable_kernel::device_operations
)
add_executable
(
client_contraction_bilinear contraction_bilinear.cpp
)
add_executable
(
client_contraction_bilinear_fp32 contraction_bilinear_fp32.cpp
)
target_link_libraries
(
client_contraction_bilinear PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_contraction_bilinear_fp32 PRIVATE composable_kernel::device_operations
)
add_executable
(
client_contraction_scale_fp64 contraction_scale_fp64.cpp
)
target_link_libraries
(
client_contraction_scale_fp64 PRIVATE composable_kernel::device_operations
)
add_executable
(
client_contraction_bilinear_fp64 contraction_bilinear_fp64.cpp
)
target_link_libraries
(
client_contraction_bilinear_fp64 PRIVATE composable_kernel::device_operations
)
add_executable
(
contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp
)
add_executable
(
contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp
)
target_link_libraries
(
contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_operations
)
...
...
client_example/04_contraction/contraction_bilinear.cpp
→
client_example/04_contraction/contraction_bilinear
_fp32
.cpp
View file @
cd0c1f57
File moved
client_example/04_contraction/contraction_bilinear_fp64.cpp
0 → 100644
View file @
cd0c1f57
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
#include "ck/library/utility/numeric.hpp"
// Scalar type used for every tensor in this example.
using F64 = double;

// Element-wise functors taken from the CK library.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Bilinear    = ck::tensor_operation::element_wise::Bilinear;

// Functors applied to A, to B, and to the combined C/D/E stage respectively.
using AElementOp   = PassThrough;
using BElementOp   = PassThrough;
using CDEElementOp = Bilinear;

// Data types of the individual tensors; all of them are double here.
using ADataType        = F64;
using BDataType        = F64;
using AccDataType      = F64;
using CShuffleDataType = F64;
using DDataType        = F64;
using DsDataType       = ck::Tuple<DDataType>; // exactly one D tensor
using EDataType        = F64;

// Number of dimensions in each of the M, N and K groups.
static constexpr ck::index_t NumDimM{2};
static constexpr ck::index_t NumDimN{2};
static constexpr ck::index_t NumDimK{2};
/// Small owning wrapper around one buffer obtained from hipMalloc.
///
/// The buffer is requested in the constructor and handed to hipFree in the
/// destructor. Both HIP status codes are deliberately discarded, as in the
/// rest of this example; p_mem_ is value-initialized to null before the
/// allocation call.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Requests mem_size bytes from hipMalloc.
    /// explicit: a byte count should never convert silently into a buffer.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    // The destructor frees p_mem_, so a copy would free the same pointer twice.
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the stored pointer; ownership stays with this object.
    void* GetDeviceBuffer() const { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
int
main
(
int
argc
,
char
*
argv
[])
{
// kknn
#if 1
// A[M0, M1, K0, K1]
std
::
vector
<
ck
::
index_t
>
a_ms_ks_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
a_ms_ks_strides
{
524288
,
4096
,
128
,
1
};
// B[N0, N1, K0, K1]
std
::
vector
<
ck
::
index_t
>
b_ns_ks_lengths
{
32
,
64
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
b_ns_ks_strides
{
524288
,
4096
,
128
,
1
};
// D[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
d_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
d_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
// E[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
e_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
e_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
// knnn
#elif 0
// A[M0, M1, K0, K1]
std
::
vector
<
ck
::
index_t
>
a_ms_ks_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
a_ms_ks_strides
{
524288
,
4096
,
128
,
1
};
// B[N0, N1, K0, K1]
std
::
vector
<
ck
::
index_t
>
b_ns_ks_lengths
{
32
,
64
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
b_ns_ks_strides
{
64
,
1
,
131072
,
2048
};
// D[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
d_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
d_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
// E[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
e_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
e_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
// mknn
#elif 0
// A[M0, M1, K0, K1]
std
::
vector
<
ck
::
index_t
>
a_ms_ks_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
a_ms_ks_strides
{
128
,
1
,
245760
,
3840
};
// B[N0, N1, K0, K1]
std
::
vector
<
ck
::
index_t
>
b_ns_ks_lengths
{
32
,
64
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
b_ns_ks_strides
{
524288
,
4096
,
128
,
1
};
// D[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
d_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
d_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
// E[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
e_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
e_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
// mnnn
#elif 0
// A[M0, M1, K0, K1]
std
::
vector
<
ck
::
index_t
>
a_ms_ks_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
a_ms_ks_strides
{
128
,
1
,
245760
,
3840
};
// B[N0, N1, K0, K1]
std
::
vector
<
ck
::
index_t
>
b_ns_ks_lengths
{
32
,
64
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
b_ns_ks_strides
{
64
,
1
,
131072
,
2048
};
// D[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
d_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
d_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
// E[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
e_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
e_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
#endif
float
alpha
=
1.
f
;
float
beta
=
1.
f
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
25
)
{
const
ck
::
index_t
M0
=
std
::
stoi
(
argv
[
1
]);
const
ck
::
index_t
M1
=
std
::
stoi
(
argv
[
2
]);
const
ck
::
index_t
N0
=
std
::
stoi
(
argv
[
3
]);
const
ck
::
index_t
N1
=
std
::
stoi
(
argv
[
4
]);
const
ck
::
index_t
K0
=
std
::
stoi
(
argv
[
5
]);
const
ck
::
index_t
K1
=
std
::
stoi
(
argv
[
6
]);
a_ms_ks_lengths
=
{
M0
,
M1
,
K0
,
K1
};
a_ms_ks_strides
=
{
std
::
stoi
(
argv
[
7
]),
std
::
stoi
(
argv
[
8
]),
std
::
stoi
(
argv
[
9
]),
std
::
stoi
(
argv
[
10
])};
b_ns_ks_lengths
=
{
N0
,
N1
,
K0
,
K1
};
b_ns_ks_strides
=
{
std
::
stoi
(
argv
[
11
]),
std
::
stoi
(
argv
[
12
]),
std
::
stoi
(
argv
[
13
]),
std
::
stoi
(
argv
[
14
])};
d_ms_ns_lengths
=
{
M0
,
M1
,
N0
,
N1
};
d_ms_ns_strides
=
{
std
::
stoi
(
argv
[
15
]),
std
::
stoi
(
argv
[
16
]),
std
::
stoi
(
argv
[
17
]),
std
::
stoi
(
argv
[
18
])};
e_ms_ns_lengths
=
{
M0
,
M1
,
N0
,
N1
};
e_ms_ns_strides
=
{
std
::
stoi
(
argv
[
19
]),
std
::
stoi
(
argv
[
20
]),
std
::
stoi
(
argv
[
21
]),
std
::
stoi
(
argv
[
22
])};
alpha
=
std
::
stof
(
argv
[
23
]);
beta
=
std
::
stof
(
argv
[
24
]);
}
else
{
printf
(
"arg1 to 6: M0, M1, N0, N1, K0, K1
\n
"
);
printf
(
"arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1
\n
"
);
printf
(
"arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1
\n
"
);
printf
(
"arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1
\n
"
);
printf
(
"arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1
\n
"
);
printf
(
"arg23 to 24: alpha, beta
\n
"
);
exit
(
0
);
}
auto
f_tensor_space_size
=
[](
auto
lengths
,
auto
strides
)
{
std
::
size_t
space_size
=
1
;
for
(
std
::
size_t
i
=
0
;
i
<
lengths
.
size
();
++
i
)
{
space_size
+=
(
lengths
[
i
]
-
1
)
*
strides
[
i
];
}
return
space_size
;
};
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
f_tensor_space_size
(
a_ms_ks_lengths
,
a_ms_ks_strides
));
SimpleDeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
f_tensor_space_size
(
b_ns_ks_lengths
,
b_ns_ks_strides
));
SimpleDeviceMem
d_device_buf
(
sizeof
(
DDataType
)
*
f_tensor_space_size
(
d_ms_ns_lengths
,
d_ms_ns_strides
));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_tensor_space_size
(
e_ms_ns_lengths
,
e_ms_ns_strides
));
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceContractionMultipleD
<
NumDimM
,
NumDimN
,
NumDimK
,
ADataType
,
BDataType
,
ck
::
Tuple
<
DDataType
>
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
Bilinear
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{
alpha
,
beta
};
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
d_device_buf
.
GetDeviceBuffer
()},
e_device_buf
.
GetDeviceBuffer
(),
a_ms_ks_lengths
,
a_ms_ks_strides
,
b_ns_ks_lengths
,
b_ns_ks_strides
,
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
d_ms_ns_lengths
},
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
d_ms_ns_strides
},
e_ms_ns_lengths
,
e_ms_ns_strides
,
a_element_op
,
b_element_op
,
cde_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
ck
::
index_t
M
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
e_ms_ns_lengths
.
begin
(),
NumDimM
,
1
,
std
::
multiplies
<>
{});
ck
::
index_t
N
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
e_ms_ns_lengths
.
begin
()
+
NumDimM
,
NumDimN
,
1
,
std
::
multiplies
<>
{});
ck
::
index_t
K
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
a_ms_ks_lengths
.
begin
()
+
NumDimM
,
NumDimK
,
1
,
std
::
multiplies
<>
{});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
DDataType
)
*
M
*
N
+
sizeof
(
EDataType
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
return
0
;
}
client_example/04_contraction/contraction_scale.cpp
→
client_example/04_contraction/contraction_scale
_fp32
.cpp
View file @
cd0c1f57
File moved
client_example/04_contraction/contraction_scale_fp64.cpp
0 → 100644
View file @
cd0c1f57
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
#include "ck/library/utility/numeric.hpp"
// Scalar type used for every tensor in this example.
using F64 = double;

// Element-wise functors taken from the CK library.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale       = ck::tensor_operation::element_wise::Scale;

// Functors applied to A, to B, and to the combined C/D/E stage respectively.
using AElementOp   = PassThrough;
using BElementOp   = PassThrough;
using CDEElementOp = Scale;

// Data types of the individual tensors; all of them are double here.
using ADataType        = F64;
using BDataType        = F64;
using AccDataType      = F64;
using CShuffleDataType = F64;
using DsDataType       = ck::Tuple<>; // no D tensors
using EDataType        = F64;

// Number of dimensions in each of the M, N and K groups.
static constexpr ck::index_t NumDimM{2};
static constexpr ck::index_t NumDimN{2};
static constexpr ck::index_t NumDimK{2};
/// Small owning wrapper around one buffer obtained from hipMalloc.
///
/// The buffer is requested in the constructor and handed to hipFree in the
/// destructor. Both HIP status codes are deliberately discarded, as in the
/// rest of this example; p_mem_ is value-initialized to null before the
/// allocation call.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Requests mem_size bytes from hipMalloc.
    /// explicit: a byte count should never convert silently into a buffer.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    // The destructor frees p_mem_, so a copy would free the same pointer twice.
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the stored pointer; ownership stays with this object.
    void* GetDeviceBuffer() const { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
/// Profiles every registered fp64 contraction + Scale instance and reports the fastest one.
///
/// Command line:
///   (no arguments)  run the built-in default problem
///   19 arguments    arg1-6   M0, M1, N0, N1, K0, K1
///                   arg7-10  strides of A (M0, M1, K0, K1)
///                   arg11-14 strides of B (N0, N1, K0, K1)
///                   arg15-18 strides of E (M0, M1, N0, N1)
///                   arg19    scale
/// Any other argument count prints the usage text and exits with status 0.
int main(int argc, char* argv[])
{
    // Default problem sizes; they are identical for every layout variant below.
    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64}; // A[M0, M1, K0, K1]
    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};  // B[N0, N1, K0, K1]
    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64}; // E[M0, M1, N0, N1]
    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};

    // Only the A/B strides differ between the layout variants.
// kkn
#if 1
    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// knn
#elif 0
    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
    std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
// mkn
#elif 0
    std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
// mnn
#elif 0
    std::vector<ck::index_t> a_ms_ks_strides{128, 1, 245760, 3840};
    std::vector<ck::index_t> b_ns_ks_strides{64, 1, 131072, 2048};
#endif

    float scale = 1.f;

    if(argc == 1)
    {
        // use default case
    }
    else if(argc == 20)
    {
        // std::stoi throws on malformed input; that is left unhandled as in the original.
        const auto int_arg = [&](int idx) { return std::stoi(argv[idx]); };

        const ck::index_t M0 = int_arg(1);
        const ck::index_t M1 = int_arg(2);

        const ck::index_t N0 = int_arg(3);
        const ck::index_t N1 = int_arg(4);

        const ck::index_t K0 = int_arg(5);
        const ck::index_t K1 = int_arg(6);

        a_ms_ks_lengths = {M0, M1, K0, K1};
        a_ms_ks_strides = {int_arg(7), int_arg(8), int_arg(9), int_arg(10)};

        b_ns_ks_lengths = {N0, N1, K0, K1};
        b_ns_ks_strides = {int_arg(11), int_arg(12), int_arg(13), int_arg(14)};

        e_ms_ns_lengths = {M0, M1, N0, N1};
        e_ms_ns_strides = {int_arg(15), int_arg(16), int_arg(17), int_arg(18)};

        scale = std::stof(argv[19]);
    }
    else
    {
        printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
        printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
        printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
        printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
        printf("arg19: scale\n");
        return 0;
    }

    // Number of elements spanned by a strided tensor: 1 + sum((length - 1) * stride).
    // The product is formed in std::size_t so that large extents cannot overflow int.
    const auto f_tensor_space_size = [](const auto& lengths, const auto& strides) {
        std::size_t space_size = 1;
        for(std::size_t i = 0; i < lengths.size(); ++i)
        {
            space_size += static_cast<std::size_t>(lengths[i] - 1) * strides[i];
        }
        return space_size;
    };

    SimpleDeviceMem a_device_buf(sizeof(ADataType) *
                                 f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
    SimpleDeviceMem b_device_buf(sizeof(BDataType) *
                                 f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
                                 f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));

    using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<NumDimM,
                                                                              NumDimN,
                                                                              NumDimK,
                                                                              ADataType,
                                                                              BDataType,
                                                                              ck::Tuple<>,
                                                                              EDataType,
                                                                              AElementOp,
                                                                              BElementOp,
                                                                              CDEElementOp>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    const auto a_element_op   = AElementOp{};
    const auto b_element_op   = BElementOp{};
    const auto cde_element_op = CDEElementOp{scale};

    std::string best_op_name;
    bool found            = false;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(const auto& op_ptr : op_ptrs)
    {
        // This operation has no D tensors, hence the zero-length arrays.
        auto argument_ptr =
            op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
                                        b_device_buf.GetDeviceBuffer(),
                                        std::array<const void*, 0>{},
                                        e_device_buf.GetDeviceBuffer(),
                                        a_ms_ks_lengths,
                                        a_ms_ks_strides,
                                        b_ns_ks_lengths,
                                        b_ns_ks_strides,
                                        std::array<std::vector<ck::index_t>, 0>{},
                                        std::array<std::vector<ck::index_t>, 0>{},
                                        e_ms_ns_lengths,
                                        e_ms_ns_strides,
                                        a_element_op,
                                        b_element_op,
                                        cde_element_op);

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        const std::string op_name = op_ptr->GetTypeString();

        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::cout << op_name << " does not support this problem" << std::endl;
            continue;
        }

        const float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

        // Collapse the multi-dimensional extents into GEMM-like M, N and K.
        const ck::index_t M = ck::accumulate_n<ck::index_t>(
            e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});

        const ck::index_t N = ck::accumulate_n<ck::index_t>(
            e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});

        const ck::index_t K = ck::accumulate_n<ck::index_t>(
            a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});

        const std::size_t flop      = std::size_t(2) * M * N * K;
        const std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                      sizeof(EDataType) * M * N;

        // ave_time is printed in ms below, so these divisors yield TFlops and GB/s.
        const float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
        const float gb_per_sec = num_btype / 1.E6 / ave_time;

        std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                  << gb_per_sec << " GB/s, " << op_name << std::endl;

        if(tflops > best_tflops)
        {
            found           = true;
            best_op_name    = op_name;
            best_tflops     = tflops;
            best_ave_time   = ave_time;
            best_gb_per_sec = gb_per_sec;
        }
    }

    // Do not report an all-zero "best" result when nothing could run.
    if(!found)
    {
        std::cout << "no supported instance found for this problem" << std::endl;
        return 0;
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    return 0;
}
client_example/09_quantization/CMakeLists.txt
View file @
cd0c1f57
add_executable
(
client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp
)
target_link_libraries
(
client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations
)
add_executable
(
client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp
)
add_executable
(
client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp
)
target_link_libraries
(
client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations
)
add_executable
(
client_conv2d_fwd_bias_tanh_perlayer_quantization conv2d_fwd_bias_tanh_perlayer_quantization.cpp
)
target_link_libraries
(
client_conv2d_fwd_bias_tanh_perlayer_quantization PRIVATE composable_kernel::device_operations
)
add_executable
(
client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp
)
add_executable
(
client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp
)
target_link_libraries
(
client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations
)
...
...
client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
View file @
cd0c1f57
...
@@ -73,7 +73,7 @@ int main(int argc, char* argv[])
...
@@ -73,7 +73,7 @@ int main(int argc, char* argv[])
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
C
);
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
C
);
SimpleDeviceMem
wei
(
sizeof
(
WeiDataType
)
*
K
*
Y
*
X
*
C
);
SimpleDeviceMem
wei
(
sizeof
(
WeiDataType
)
*
K
*
Y
*
X
*
C
);
SimpleDeviceMem
bias
(
sizeof
(
BiasDataType
)
*
K
*
Y
*
X
*
C
);
SimpleDeviceMem
bias
(
sizeof
(
BiasDataType
)
*
K
*
Y
*
X
*
C
);
SimpleDeviceMem
requant_scale
(
sizeof
(
RequantScaleDataType
)
*
K
*
Y
*
X
*
C
);
SimpleDeviceMem
requant_scale
(
sizeof
(
RequantScaleDataType
)
*
K
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
K
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
K
);
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD
<
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD
<
...
@@ -203,4 +203,4 @@ int main(int argc, char* argv[])
...
@@ -203,4 +203,4 @@ int main(int argc, char* argv[])
}
}
return
0
;
return
0
;
}
}
\ No newline at end of file
client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
View file @
cd0c1f57
...
@@ -26,15 +26,16 @@ using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clam
...
@@ -26,15 +26,16 @@ using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clam
static
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
static
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
static
constexpr
ck
::
index_t
G
=
1
;
static
constexpr
ck
::
index_t
G
=
1
;
static
constexpr
ck
::
index_t
N
=
4
;
// batch size
static
constexpr
ck
::
index_t
N
=
4
;
// batch size
static
constexpr
ck
::
index_t
K
=
64
;
// output channel
static
constexpr
ck
::
index_t
K
=
64
;
// output channel
static
constexpr
ck
::
index_t
C
=
192
;
// input channel
static
constexpr
ck
::
index_t
C
=
192
;
// input channel
static
constexpr
ck
::
index_t
Y
=
3
;
// filter H
static
constexpr
ck
::
index_t
Y
=
3
;
// filter H
static
constexpr
ck
::
index_t
X
=
3
;
// filter W
static
constexpr
ck
::
index_t
X
=
3
;
// filter W
static
constexpr
ck
::
index_t
Hi
=
71
;
// input H
static
constexpr
ck
::
index_t
Hi
=
71
;
// input H
static
constexpr
ck
::
index_t
Wi
=
71
;
// input W
static
constexpr
ck
::
index_t
Wi
=
71
;
// input W
static
constexpr
ck
::
index_t
Ho
=
36
;
// output H
static
constexpr
ck
::
index_t
Ho
=
36
;
// output H
static
constexpr
ck
::
index_t
Wo
=
36
;
// output W
static
constexpr
ck
::
index_t
Wo
=
36
;
// output W
static
constexpr
float
requant_scale
=
0.5
f
;
// requantize qAcc to qz
struct
SimpleDeviceMem
struct
SimpleDeviceMem
{
{
...
@@ -102,26 +103,27 @@ int main(int argc, char* argv[])
...
@@ -102,26 +103,27 @@ int main(int argc, char* argv[])
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
auto
argument_ptr
=
wei
.
GetDeviceBuffer
(),
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
{
bias
.
GetDeviceBuffer
()},
wei
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
{
bias
.
GetDeviceBuffer
()},
in_lengths
,
out
.
GetDeviceBuffer
(),
in_strides
,
in_lengths
,
weight_lengths
,
in_strides
,
weight_strides
,
weight_lengths
,
{
bias_lengths
},
weight_strides
,
{
bias_strides
},
{
bias_lengths
},
out_lengths
,
{
bias_strides
},
out_strides
,
out_lengths
,
conv_strides
,
out_strides
,
conv_dilations
,
conv_strides
,
in_left_pad
,
conv_dilations
,
in_right_pad
,
in_left_pad
,
PassThrough
{},
in_right_pad
,
PassThrough
{},
PassThrough
{},
OutElementOp
{
0.5
f
,
ActivationOp
{}});
PassThrough
{},
OutElementOp
{
requant_scale
,
ActivationOp
{}});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
...
@@ -165,25 +167,26 @@ int main(int argc, char* argv[])
...
@@ -165,25 +167,26 @@ int main(int argc, char* argv[])
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
auto
argument_ptr
=
wei
.
GetDeviceBuffer
(),
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
{
bias
.
GetDeviceBuffer
()},
wei
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
{
bias
.
GetDeviceBuffer
()},
in_lengths
,
out
.
GetDeviceBuffer
(),
in_strides
,
in_lengths
,
weight_lengths
,
in_strides
,
weight_strides
,
weight_lengths
,
{
bias_lengths
},
weight_strides
,
{
bias_strides
},
{
bias_lengths
},
out_lengths
,
{
bias_strides
},
out_strides
,
out_lengths
,
conv_strides
,
out_strides
,
conv_dilations
,
conv_strides
,
in_left_pad
,
conv_dilations
,
in_right_pad
,
in_left_pad
,
PassThrough
{},
in_right_pad
,
PassThrough
{},
PassThrough
{},
OutElementOp
{
0.5
f
,
ActivationOp
{}});
PassThrough
{},
OutElementOp
{
requant_scale
,
ActivationOp
{}});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
...
...
client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp
0 → 100644
View file @
cd0c1f57
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <iostream>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
// Tensor element types: int8 input/weight/output, int32 bias, float per-channel scale.
using InDataType           = int8_t;
using WeiDataType          = int8_t;
using BiasDataType         = int32_t;
using RequantScaleDataType = float;
using OutDataType          = int8_t;

// Layout tags taken from ck::tensor_layout::convolution.
using InLayout           = ck::tensor_layout::convolution::GNHWC;
using WeiLayout          = ck::tensor_layout::convolution::GKYXC;
using BiasLayout         = ck::tensor_layout::convolution::G_K;
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
using OutLayout          = ck::tensor_layout::convolution::GNHWK;

// Element-wise operators; the output operator is parameterised with the TanH activation.
using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp =
    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;

// Fixed problem description used by main().
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G             = 1;
static constexpr ck::index_t N             = 4;   // batch size
static constexpr ck::index_t K             = 64;  // output channel
static constexpr ck::index_t C             = 192; // input channel
static constexpr ck::index_t Y             = 3;   // filter H
static constexpr ck::index_t X             = 3;   // filter W
static constexpr ck::index_t Hi            = 71;  // input H
static constexpr ck::index_t Wi            = 71;  // input W
static constexpr ck::index_t Ho            = 36;  // output H
static constexpr ck::index_t Wo            = 36;  // output W
static constexpr float sz_inv              = 0.5f; // inverse of scale_z
/// Minimal owning wrapper around a single device allocation made with hipMalloc.
///
/// The pointer is released with hipFree in the destructor. The status codes of both
/// HIP calls are discarded on purpose (best-effort example code). p_mem_ starts out as
/// nullptr; NOTE(review): it presumably stays null when hipMalloc fails - confirm.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Allocates mem_size bytes of device memory.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    // The object owns p_mem_: a copy would make two destructors hipFree the same pointer.
    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw device pointer; ownership stays with this object.
    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
/// Profiles every registered int8 conv2d + bias + TanH per-channel-quantization instance,
/// prints the timing of each supported one, then re-runs the fastest without timing.
/// The command-line arguments are not used; the problem is fixed by the file-level constants.
int main(int argc, char* argv[])
{
    // Lengths are listed in [G, N, C, H, W] / [G, K, C, Y, X] order; the strides place the
    // channel dimension innermost (stride 1).
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    // Bias and requant scale carry one value per (G, K); zero strides repeat it over N, Ho, Wo.
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

    // Buffer sizes follow the strides above. sizeof() comes first so every product is
    // evaluated in std::size_t. The bias buffer holds G * K values (it was K * Y * X * C).
    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
    SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * G * K);
    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
        NumDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<BiasLayout, RequantScaleLayout>,
        OutLayout,
        InDataType,
        WeiDataType,
        ck::Tuple<BiasDataType, RequantScaleDataType>,
        OutDataType,
        PassThrough,
        PassThrough,
        OutElementOp>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    // Builds the argument object for one instance; shared by the profiling loop and the
    // final run so the two calls cannot drift apart.
    const auto make_argument = [&](const auto& op) {
        return op->MakeArgumentPointer(in.GetDeviceBuffer(),
                                       wei.GetDeviceBuffer(),
                                       {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
                                       out.GetDeviceBuffer(),
                                       in_lengths,
                                       in_strides,
                                       weight_lengths,
                                       weight_strides,
                                       {bias_lengths, requant_scale_lengths},
                                       {bias_strides, requant_scale_strides},
                                       out_lengths,
                                       out_strides,
                                       conv_strides,
                                       conv_dilations,
                                       in_left_pad,
                                       in_right_pad,
                                       PassThrough{},
                                       PassThrough{},
                                       OutElementOp{sz_inv, ActivationOp{}});
    };

    std::string best_op_name;
    int best_op_id        = -1; // -1 means "no supported instance seen yet"
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    const int num_ops = static_cast<int>(op_ptrs.size());
    for(int i = 0; i < num_ops; ++i)
    {
        auto& op_ptr            = op_ptrs[i];
        auto argument_ptr       = make_argument(op_ptr);
        auto invoker_ptr        = op_ptr->MakeInvokerPointer();
        const std::string op_name = op_ptr->GetTypeString();

        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::cout << op_name << " does not support this problem" << std::endl;
            continue;
        }

        const float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

        // Start the product in std::size_t: the int product is already above 2^30.
        const std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
        const std::size_t num_bytes =
            G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
            G * sizeof(BiasDataType) * K + G * sizeof(RequantScaleDataType) * K +
            G * sizeof(OutDataType) * N * Ho * Wo * K;

        // avg_time is printed in ms below, so these divisors yield TFlops and GB/s.
        const float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
        const float gb_per_sec = num_bytes / 1.E6 / avg_time;

        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                  << gb_per_sec << " GB/s, " << op_name << std::endl;

        if(tflops > best_tflops)
        {
            best_op_id      = i;
            best_op_name    = op_name;
            best_avg_time   = avg_time;
            best_gb_per_sec = gb_per_sec;
            best_tflops     = tflops;
        }
    }

    // run the best instance
    if(best_op_id != -1)
    {
        std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
                  << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = make_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp
0 → 100644
View file @
cd0c1f57
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <iostream>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
// Tensor element types: int8 input/weight/output, int32 bias.
using InDataType   = int8_t;
using WeiDataType  = int8_t;
using BiasDataType = int32_t;
using OutDataType  = int8_t;

// Layout tags taken from ck::tensor_layout::convolution.
using InLayout   = ck::tensor_layout::convolution::GNHWC;
using WeiLayout  = ck::tensor_layout::convolution::GKYXC;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using OutLayout  = ck::tensor_layout::convolution::GNHWK;

// Element-wise operators; the output operator is parameterised with the TanH activation.
using PassThrough  = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = ck::tensor_operation::element_wise::TanH;
using OutElementOp =
    ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;

// Fixed problem description used by main().
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G             = 1;
static constexpr ck::index_t N             = 4;   // batch size
static constexpr ck::index_t K             = 64;  // output channel
static constexpr ck::index_t C             = 192; // input channel
static constexpr ck::index_t Y             = 3;   // filter H
static constexpr ck::index_t X             = 3;   // filter W
static constexpr ck::index_t Hi            = 71;  // input H
static constexpr ck::index_t Wi            = 71;  // input W
static constexpr ck::index_t Ho            = 36;  // output H
static constexpr ck::index_t Wo            = 36;  // output W
static constexpr float sacc                = 0.5f; // scale of acc
static constexpr float sz_inv              = 0.5f; // inverse of scale_z
/// Minimal owning wrapper around a single device allocation made with hipMalloc.
///
/// The pointer is released with hipFree in the destructor. The status codes of both
/// HIP calls are discarded on purpose (best-effort example code). p_mem_ starts out as
/// nullptr; NOTE(review): it presumably stays null when hipMalloc fails - confirm.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Allocates mem_size bytes of device memory.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    // The object owns p_mem_: a copy would make two destructors hipFree the same pointer.
    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw device pointer; ownership stays with this object.
    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
/// Profiles every registered int8 conv2d + bias + TanH per-layer-quantization instance,
/// prints the timing of each supported one, then re-runs the fastest without timing.
/// The command-line arguments are not used; the problem is fixed by the file-level constants.
int main(int argc, char* argv[])
{
    // Lengths are listed in [G, N, C, H, W] / [G, K, C, Y, X] order; the strides place the
    // channel dimension innermost (stride 1).
    std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
    std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
    std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
    std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
    // The bias carries one value per (G, K); zero strides repeat it over N, Ho, Wo.
    std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
    std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
    std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
    std::array<ck::index_t, 2> in_left_pad{1, 1};
    std::array<ck::index_t, 2> in_right_pad{1, 1};
    std::array<ck::index_t, 2> conv_strides{2, 2};
    std::array<ck::index_t, 2> conv_dilations{1, 1};

    // Buffer sizes follow the strides above. sizeof() comes first so every product is
    // evaluated in std::size_t. The bias buffer holds G * K values (it was K * Y * X * C).
    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
    SimpleDeviceMem bias(sizeof(BiasDataType) * G * K);
    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);

    using DeviceOp =
        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                    InLayout,
                                                                    WeiLayout,
                                                                    ck::Tuple<BiasLayout>,
                                                                    OutLayout,
                                                                    InDataType,
                                                                    WeiDataType,
                                                                    ck::Tuple<BiasDataType>,
                                                                    OutDataType,
                                                                    PassThrough,
                                                                    PassThrough,
                                                                    OutElementOp>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    // Builds the argument object for one instance; shared by the profiling loop and the
    // final run so the two calls cannot drift apart.
    const auto make_argument = [&](const auto& op) {
        return op->MakeArgumentPointer(in.GetDeviceBuffer(),
                                       wei.GetDeviceBuffer(),
                                       {bias.GetDeviceBuffer()},
                                       out.GetDeviceBuffer(),
                                       in_lengths,
                                       in_strides,
                                       weight_lengths,
                                       weight_strides,
                                       {bias_lengths},
                                       {bias_strides},
                                       out_lengths,
                                       out_strides,
                                       conv_strides,
                                       conv_dilations,
                                       in_left_pad,
                                       in_right_pad,
                                       PassThrough{},
                                       PassThrough{},
                                       OutElementOp{sacc, sz_inv, ActivationOp{}});
    };

    std::string best_op_name;
    int best_op_id        = -1; // -1 means "no supported instance seen yet"
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    const int num_ops = static_cast<int>(op_ptrs.size());
    for(int i = 0; i < num_ops; ++i)
    {
        auto& op_ptr            = op_ptrs[i];
        auto argument_ptr       = make_argument(op_ptr);
        auto invoker_ptr        = op_ptr->MakeInvokerPointer();
        const std::string op_name = op_ptr->GetTypeString();

        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::cout << op_name << " does not support this problem" << std::endl;
            continue;
        }

        const float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

        // Start the product in std::size_t: the int product is already above 2^30.
        const std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
        const std::size_t num_bytes =
            G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
            G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;

        // avg_time is printed in ms below, so these divisors yield TFlops and GB/s.
        const float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
        const float gb_per_sec = num_bytes / 1.E6 / avg_time;

        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                  << gb_per_sec << " GB/s, " << op_name << std::endl;

        if(tflops > best_tflops)
        {
            best_op_id      = i;
            best_op_name    = op_name;
            best_avg_time   = avg_time;
            best_gb_per_sec = gb_per_sec;
            best_tflops     = tflops;
        }
    }

    // run the best instance
    if(best_op_id != -1)
    {
        std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
                  << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = make_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
\ No newline at end of file
client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
View file @
cd0c1f57
...
@@ -69,7 +69,7 @@ int main(int argc, char* argv[])
...
@@ -69,7 +69,7 @@ int main(int argc, char* argv[])
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
C
);
SimpleDeviceMem
in
(
sizeof
(
InDataType
)
*
N
*
Hi
*
Wi
*
C
);
SimpleDeviceMem
wei
(
sizeof
(
WeiDataType
)
*
K
*
Y
*
X
*
C
);
SimpleDeviceMem
wei
(
sizeof
(
WeiDataType
)
*
K
*
Y
*
X
*
C
);
SimpleDeviceMem
requant_scale
(
sizeof
(
RequantScaleDataType
)
*
K
*
Y
*
X
*
C
);
SimpleDeviceMem
requant_scale
(
sizeof
(
RequantScaleDataType
)
*
K
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
K
);
SimpleDeviceMem
out
(
sizeof
(
OutDataType
)
*
N
*
Ho
*
Wo
*
K
);
using
DeviceOp
=
using
DeviceOp
=
...
@@ -196,4 +196,4 @@ int main(int argc, char* argv[])
...
@@ -196,4 +196,4 @@ int main(int argc, char* argv[])
}
}
return
0
;
return
0
;
}
}
\ No newline at end of file
client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
View file @
cd0c1f57
...
@@ -24,15 +24,16 @@ using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Ac
...
@@ -24,15 +24,16 @@ using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Ac
static
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
static
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
static
constexpr
ck
::
index_t
G
=
1
;
static
constexpr
ck
::
index_t
G
=
1
;
static
constexpr
ck
::
index_t
N
=
4
;
// batch size
static
constexpr
ck
::
index_t
N
=
4
;
// batch size
static
constexpr
ck
::
index_t
K
=
64
;
// output channel
static
constexpr
ck
::
index_t
K
=
64
;
// output channel
static
constexpr
ck
::
index_t
C
=
192
;
// input channel
static
constexpr
ck
::
index_t
C
=
192
;
// input channel
static
constexpr
ck
::
index_t
Y
=
3
;
// filter H
static
constexpr
ck
::
index_t
Y
=
3
;
// filter H
static
constexpr
ck
::
index_t
X
=
3
;
// filter W
static
constexpr
ck
::
index_t
X
=
3
;
// filter W
static
constexpr
ck
::
index_t
Hi
=
71
;
// input H
static
constexpr
ck
::
index_t
Hi
=
71
;
// input H
static
constexpr
ck
::
index_t
Wi
=
71
;
// input W
static
constexpr
ck
::
index_t
Wi
=
71
;
// input W
static
constexpr
ck
::
index_t
Ho
=
36
;
// output H
static
constexpr
ck
::
index_t
Ho
=
36
;
// output H
static
constexpr
ck
::
index_t
Wo
=
36
;
// output W
static
constexpr
ck
::
index_t
Wo
=
36
;
// output W
static
constexpr
float
requant_scale
=
0.5
f
;
// requantize qAcc to qY
struct
SimpleDeviceMem
struct
SimpleDeviceMem
{
{
...
@@ -96,26 +97,27 @@ int main(int argc, char* argv[])
...
@@ -96,26 +97,27 @@ int main(int argc, char* argv[])
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
auto
argument_ptr
=
wei
.
GetDeviceBuffer
(),
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
{},
wei
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
{},
in_lengths
,
out
.
GetDeviceBuffer
(),
in_strides
,
in_lengths
,
weight_lengths
,
in_strides
,
weight_strides
,
weight_lengths
,
{},
weight_strides
,
{},
{},
out_lengths
,
{},
out_strides
,
out_lengths
,
conv_strides
,
out_strides
,
conv_dilations
,
conv_strides
,
in_left_pad
,
conv_dilations
,
in_right_pad
,
in_left_pad
,
PassThrough
{},
in_right_pad
,
PassThrough
{},
PassThrough
{},
OutElementOp
{
0.5
f
,
ActivationOp
{}});
PassThrough
{},
OutElementOp
{
requant_scale
,
ActivationOp
{}});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
...
@@ -158,25 +160,26 @@ int main(int argc, char* argv[])
...
@@ -158,25 +160,26 @@ int main(int argc, char* argv[])
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
auto
argument_ptr
=
wei
.
GetDeviceBuffer
(),
op_ptr
->
MakeArgumentPointer
(
in
.
GetDeviceBuffer
(),
{},
wei
.
GetDeviceBuffer
(),
out
.
GetDeviceBuffer
(),
{},
in_lengths
,
out
.
GetDeviceBuffer
(),
in_strides
,
in_lengths
,
weight_lengths
,
in_strides
,
weight_strides
,
weight_lengths
,
{},
weight_strides
,
{},
{},
out_lengths
,
{},
out_strides
,
out_lengths
,
conv_strides
,
out_strides
,
conv_dilations
,
conv_strides
,
in_left_pad
,
conv_dilations
,
in_right_pad
,
in_left_pad
,
PassThrough
{},
in_right_pad
,
PassThrough
{},
PassThrough
{},
OutElementOp
{
0.5
f
,
ActivationOp
{}});
PassThrough
{},
OutElementOp
{
requant_scale
,
ActivationOp
{}});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
...
...
client_example/18_groupnorm/CMakeLists.txt
0 → 100644
View file @
cd0c1f57
# Client example: build groupnorm_swish.cpp and link it against the CK device operations.
add_executable(client_groupnorm_swish groupnorm_swish.cpp)
target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations)
Prev
1
2
3
4
5
…
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment