Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
aa5859e4
Commit
aa5859e4
authored
Aug 13, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into wavelet_model
parents
9bd6cc0e
5ee30459
Changes
280
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1129 additions
and
203 deletions
+1129
-203
CMakeLists.txt
CMakeLists.txt
+1
-8
Dockerfile
Dockerfile
+17
-15
Jenkinsfile
Jenkinsfile
+343
-112
README.md
README.md
+9
-1
client_example/01_gemm/CMakeLists.txt
client_example/01_gemm/CMakeLists.txt
+2
-0
client_example/01_gemm/gemm.cpp
client_example/01_gemm/gemm.cpp
+218
-0
client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
...xample/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
+24
-20
client_example/03_gemm_layernorm/CMakeLists.txt
client_example/03_gemm_layernorm/CMakeLists.txt
+2
-2
client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
+10
-9
client_example/04_contraction/CMakeLists.txt
client_example/04_contraction/CMakeLists.txt
+6
-0
client_example/04_contraction/contraction_bilinear.cpp
client_example/04_contraction/contraction_bilinear.cpp
+241
-0
client_example/04_contraction/contraction_scale.cpp
client_example/04_contraction/contraction_scale.cpp
+227
-0
client_example/CMakeLists.txt
client_example/CMakeLists.txt
+2
-0
client_example/README.md
client_example/README.md
+1
-12
cmake/googletest.cmake
cmake/googletest.cmake
+1
-0
example/01_gemm/CMakeLists.txt
example/01_gemm/CMakeLists.txt
+1
-0
example/01_gemm/gemm_dl_fp16.cpp
example/01_gemm/gemm_dl_fp16.cpp
+6
-6
example/01_gemm/gemm_dl_fp32.cpp
example/01_gemm/gemm_dl_fp32.cpp
+6
-6
example/01_gemm/gemm_dl_int8.cpp
example/01_gemm/gemm_dl_int8.cpp
+6
-6
example/01_gemm/gemm_xdl_bf16.cpp
example/01_gemm/gemm_xdl_bf16.cpp
+6
-6
No files found.
Too many changes to show.
To preserve performance only
280 of 280+
files are displayed.
Plain diff
Email patch
CMakeLists.txt
View file @
aa5859e4
...
...
@@ -8,7 +8,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
enable_testing
()
set
(
ROCM_SYMLINK_LIBS OFF
)
find_package
(
ROCM
0.8
REQUIRED PATHS /opt/rocm
)
find_package
(
ROCM REQUIRED PATHS /opt/rocm
)
include
(
ROCMInstallTargets
)
include
(
ROCMPackageConfigHelpers
)
...
...
@@ -71,13 +71,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
endif
()
message
(
STATUS
"Build with HIP
${
HIP_VERSION
}
"
)
rocm_create_package
(
NAME composablekernel
DESCRIPTION
"High Performance Composable Kernel for AMD GPUs"
MAINTAINER
"MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
LDCONFIG
)
## tidy
include
(
EnableCompilerWarnings
)
set
(
CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name
)
...
...
Dockerfile
View file @
aa5859e4
...
...
@@ -2,6 +2,7 @@ FROM ubuntu:18.04
ARG
ROCMVERSION=5.1
ARG
OSDB_BKC_VERSION
ARG
compiler_version
RUN
set
-xe
...
...
@@ -15,7 +16,6 @@ RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.l
RUN
wget
--no-check-certificate
-qO
- https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN
sh
-c
"echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
# ADD requirements.txt requirements.txt
# Install dependencies
RUN
apt-get update
&&
DEBIAN_FRONTEND
=
noninteractive apt-get
install
-y
--allow-unauthenticated
\
apt-utils
\
...
...
@@ -23,8 +23,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
cmake-data
=
3.15.1-0kitware1
\
cmake
=
3.15.1-0kitware1
\
curl
\
g++
\
gdb
\
git
\
hip-rocclr
\
jq
\
...
...
@@ -61,17 +59,7 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
RUN
wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
RUN
dpkg
-i
dumb-init_
*
.deb
&&
rm
dumb-init_
*
.deb
# Install cget
RUN
pip
install
cget
# Install rclone
RUN
pip
install
https://github.com/pfultz2/rclone/archive/master.tar.gz
ARG
PREFIX=/opt/rocm
# Install dependencies
RUN
cget
install
pfultz2/rocm-recipes
# Install rbuild
RUN
pip3
install
https://github.com/RadeonOpenCompute/rbuild/archive/6d78a0553babdaea8d2da5de15cbda7e869594b8.tar.gz
# Install packages for processing the performance results
RUN
pip3
install
--upgrade
pip
RUN
pip3
install
sqlalchemy
...
...
@@ -84,12 +72,26 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
ENV
LC_ALL=C.UTF-8
ENV
LANG=C.UTF-8
ADD
rbuild.ini /rbuild.ini
ADD
dev-requirements.txt dev-requirements.txt
RUN
rbuild prepare
-s
develop
-d
$PREFIX
RUN
groupadd
-f
render
# Install the new rocm-cmake version
RUN
git clone
-b
master https://github.com/RadeonOpenCompute/rocm-cmake.git
&&
\
cd
rocm-cmake
&&
mkdir
build
&&
cd
build
&&
\
cmake ..
&&
cmake
--build
.
&&
cmake
--build
.
--target
install
WORKDIR
/
ENV
compiler_version=$compiler_version
RUN
sh
-c
"echo compiler version = '
$compiler_version
'"
RUN
--mount
=
type
=
ssh
if
[
"
$compiler_version
"
!=
"release"
]
;
then
\
git clone
-b
"
$compiler_version
"
https://github.com/RadeonOpenCompute/llvm-project.git
&&
\
cd
llvm-project
&&
mkdir
build
&&
cd
build
&&
\
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/rocm/llvm
-DCMAKE_BUILD_TYPE
=
Release
-DLLVM_ENABLE_ASSERTIONS
=
1
-DLLVM_TARGETS_TO_BUILD
=
"AMDGPU;X86"
-DLLVM_ENABLE_PROJECTS
=
"clang;lld;compiler-rt"
../llvm
&&
\
make
-j
8
;
\
else
echo
"using the release compiler"
;
\
fi
#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
Jenkinsfile
View file @
aa5859e4
...
...
@@ -11,6 +11,96 @@ def show_node_info() {
"""
}
/**
 * Runs a shell command with its stdout redirected to tmp.txt, echoes the
 * captured text, and reports whether the command printed anything.
 *
 * @param command shell command line to execute
 * @return true when tmp.txt is non-empty after the command ran, false otherwise
 */
def runShell(String command){
    // returnStatus: true makes the step return the exit status instead of
    // failing on a non-zero one; the status value itself is not inspected.
    def exitStatus = sh(returnStatus: true, script: "${command} > tmp.txt")
    def output = readFile(file: "tmp.txt")
    echo "tmp.txt contents: $output"
    if (output == "") {
        return false
    }
    return true
}
/**
 * Builds the docker image reference used by this pipeline:
 * the MIOPEN_IMAGE_URL environment value followed by a
 * "composable_kernels_<COMPILER_VERSION>" tag.
 */
def getDockerImageName(){
    return "${env.MIOPEN_IMAGE_URL}:composable_kernels_${params.COMPILER_VERSION}"
}
/**
 * Pulls the prebuilt docker image named by getDockerImageName().
 *
 * Side effects: sets env.DOCKER_BUILDKIT, and, when env.CCACHE_HOST is set,
 * env.CCACHE_DIR and env.CCACHE_SECONDARY_STORAGE. A docker build-arg string
 * is assembled as well, but in this function it is only echoed.
 *
 * @param conf optional settings; reads "prefixpath", "gpu_arch" and "no_cache"
 * @return a two-element list: [image handle, image name]
 */
def getDockerImage(Map conf=[:]){
    env.DOCKER_BUILDKIT=1
    def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
    // prebuilt dockers should have all the architectures enabled so one image can be used for all stages
    def gpu_arch = conf.get("gpu_arch", "gfx908")
    def no_cache = conf.get("no_cache", false)
    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
    if(env.CCACHE_HOST)
    {
        // Probe the cache server on port 6379 with a PING. The backslashes are
        // doubled so that the shell's printf (not Groovy) expands the escapes,
        // matching the probe in buildDocker.
        def check_host = sh(script:"""(printf "PING\\r\\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
        if(check_host == "+PONG")
        {
            echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
        }
        else
        {
            echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
        }
        // The ccache settings are applied whether or not the probe answered.
        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
        env.CCACHE_DIR = """/tmp/ccache_store"""
        env.CCACHE_SECONDARY_STORAGE = """redis://${env.CCACHE_HOST}"""
    }
    if(no_cache)
    {
        dockerArgs = dockerArgs + " --no-cache "
    }
    echo "Docker Args: ${dockerArgs}"
    def image = getDockerImageName()
    //Check if image exists
    def retimage
    try
    {
        echo "Pulling down image: ${image}"
        retimage = docker.image("${image}")
        retimage.pull()
    }
    catch(Exception ex)
    {
        // Any failure while pulling is reported as a missing image and fails the step.
        error "Unable to locate image: ${image}"
    }
    return [retimage, image]
}
/**
 * Ensures the pipeline's docker image exists in the registry.
 *
 * Checks for the image with "docker manifest inspect"; when that command
 * fails, the image is built from the checked-out sources and pushed.
 *
 * Side effects: sets env.DOCKER_BUILDKIT, and, when env.CCACHE_HOST is set,
 * env.CCACHE_DIR and env.CCACHE_SECONDARY_STORAGE.
 *
 * @param install_prefix value passed to the image build as the PREFIX build-arg
 */
def buildDocker(install_prefix){
    show_node_info()
    env.DOCKER_BUILDKIT=1
    checkout scm
    def image_name = getDockerImageName()
    echo "Building Docker for ${image_name}"
    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' "
    if(env.CCACHE_HOST)
    {
        // Probe the cache server on port 6379 with a PING; "+PONG" means it answered.
        def check_host = sh(script:"""(printf "PING\\r\\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
        if(check_host == "+PONG")
        {
            echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
        }
        else
        {
            echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
        }
        // The ccache settings are applied whether or not the probe answered.
        dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
        env.CCACHE_DIR = """/tmp/ccache_store"""
        env.CCACHE_SECONDARY_STORAGE = """redis://${env.CCACHE_HOST}"""
    }

    echo "Build Args: ${dockerArgs}"
    try{
        echo "Checking for image: ${image_name}"
        sh "docker manifest inspect --insecure ${image_name}"
        echo "Image: ${image_name} found!! Skipping building image"
    }
    catch(Exception ex){
        echo "Unable to locate image: ${image_name}. Building image now"
        // Declared with def so the handle stays local to this call instead of
        // being created as a script-level binding variable.
        def retimage = docker.build("${image_name}", dockerArgs + ' .')
        retimage.push()
    }
}
def
cmake_build
(
Map
conf
=[:]){
def
compiler
=
conf
.
get
(
"compiler"
,
"/opt/rocm/bin/hipcc"
)
...
...
@@ -60,7 +150,7 @@ def cmake_build(Map conf=[:]){
"""
def
setup_cmd
=
conf
.
get
(
"setup_cmd"
,
"${cmake_envs} cmake ${setup_args} .. "
)
// reduce parallelism when compiling, clang uses too much memory
def
build_cmd
=
conf
.
get
(
"build_cmd"
,
"${build_envs} dumb-init make -j\$(( \$(nproc) /
1
)) ${config_targets}"
)
def
build_cmd
=
conf
.
get
(
"build_cmd"
,
"${build_envs} dumb-init make -j\$(( \$(nproc) /
2
)) ${config_targets}"
)
def
execute_cmd
=
conf
.
get
(
"execute_cmd"
,
""
)
def
cmd
=
conf
.
get
(
"cmd"
,
"""
...
...
@@ -85,7 +175,7 @@ def buildHipClangJob(Map conf=[:]){
env
.
HSA_ENABLE_SDMA
=
0
checkout
scm
def
image
=
"composable_kernels"
def
image
=
"composable_kernels
_${params.COMPILER_VERSION}
"
def
prefixpath
=
conf
.
get
(
"prefixpath"
,
"/opt/rocm"
)
def
gpu_arch
=
conf
.
get
(
"gpu_arch"
,
"gfx908"
)
...
...
@@ -93,22 +183,31 @@ def buildHipClangJob(Map conf=[:]){
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def
dockerOpts
=
"--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1"
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
//def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if
(
params
.
COMPILER_VERSION
!=
"release"
){
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
def
variant
=
env
.
STAGE_NAME
def
retimage
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
if
(
params
.
USE_DOCKERFILE
){
try
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
'.'
)
//retimage = docker.build("${image}", dockerArgs + '.')
(
retimage
,
image
)
=
getDockerImage
(
conf
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
)
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
timeout
(
time:
5
,
unit:
'MINUTES'
){
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
if
(
runShell
(
'grep -n "Number of devices:.*. 0" clinfo.log'
)
){
throw
new
Exception
(
"GPU not found"
)
}
else
{
echo
"GPU is OK"
}
}
}
}
...
...
@@ -117,27 +216,23 @@ def buildHipClangJob(Map conf=[:]){
throw
e
}
catch
(
Exception
ex
)
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
"--no-cache ."
)
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
"
--no-cache ."
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
)
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
timeout
(
time:
5
,
unit:
'MINUTES'
){
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log'
if
(
runShell
(
'grep -n "Number of devices:.*. 0" clinfo.log'
)
){
throw
new
Exception
(
"GPU not found"
)
}
else
{
echo
"GPU is OK"
}
}
}
else
{
timeout
(
time:
3
,
unit:
'HOURS'
){
retimage
=
docker
.
image
(
'compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54'
).
pull
()
image
=
"b56f8ac0d6ea"
sh
"docker images"
}
}
withDockerContainer
(
image:
image
,
args:
dockerOpts
+
' -v=/var/jenkins/:/var/jenkins'
)
{
timeout
(
time:
5
,
unit:
'HOURS'
)
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
cmake_build
(
conf
)
}
}
...
...
@@ -149,10 +244,6 @@ def reboot(){
build
job:
'reboot-slaves'
,
propagate:
false
,
parameters:
[
string
(
name:
'server'
,
value:
"${env.NODE_NAME}"
),]
}
def
buildHipClangJobAndReboot
(
Map
conf
=[:]){
try
{
buildHipClangJob
(
conf
)
...
...
@@ -169,14 +260,14 @@ def buildHipClangJobAndReboot(Map conf=[:]){
}
}
def
runCKProfiler
(
Map
conf
=[:]){
show_node_info
()
env
.
HSA_ENABLE_SDMA
=
0
checkout
scm
def
image
=
"composable_kernels"
def
image
=
"composable_kernels_${params.COMPILER_VERSION}"
def
prefixpath
=
conf
.
get
(
"prefixpath"
,
"/opt/rocm"
)
def
gpu_arch
=
conf
.
get
(
"gpu_arch"
,
"gfx908"
)
...
...
@@ -184,22 +275,29 @@ def runCKProfiler(Map conf=[:]){
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def
dockerOpts
=
"--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1"
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if
(
params
.
COMPILER_VERSION
!=
"release"
){
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
def
variant
=
env
.
STAGE_NAME
def
retimage
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
if
(
params
.
USE_DOCKERFILE
){
try
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
'.'
)
//retimage = docker.build("${image}", dockerArgs + '.')
(
retimage
,
image
)
=
getDockerImage
(
conf
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
)
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
timeout
(
time:
5
,
unit:
'MINUTES'
){
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
if
(
runShell
(
'grep -n "Number of devices:.*. 0" clinfo.log'
)
){
throw
new
Exception
(
"GPU not found"
)
}
else
{
echo
"GPU is OK"
}
}
}
}
...
...
@@ -208,74 +306,61 @@ def runCKProfiler(Map conf=[:]){
throw
e
}
catch
(
Exception
ex
)
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
"--no-cache ."
)
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
"
--no-cache ."
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
)
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
timeout
(
time:
5
,
unit:
'MINUTES'
){
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
if
(
runShell
(
'grep -n "Number of devices:.*. 0" clinfo.log'
)
){
throw
new
Exception
(
"GPU not found"
)
}
else
{
echo
"GPU is OK"
}
}
}
else
{
timeout
(
time:
3
,
unit:
'HOURS'
){
retimage
=
docker
.
image
(
'compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54'
).
pull
()
image
=
"b56f8ac0d6ea"
sh
"docker images"
}
}
withDockerContainer
(
image:
image
,
args:
dockerOpts
+
' -v=/var/jenkins/:/var/jenkins'
)
{
timeout
(
time:
5
,
unit:
'HOURS'
)
timeout
(
time:
24
,
unit:
'HOURS'
)
{
cmake_build
(
conf
)
dir
(
"script"
){
//run gemm performance tests
def
gemm_log
=
"perf_gemm_${gpu_arch}.log"
sh
"rm -f ${gemm_log}"
sh
"echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}"
sh
"echo Node name: ${NODE_NAME} >> ${gemm_log}"
sh
"echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
sh
"rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
sh
"hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
sh
"/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
sh
"./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}"
sh
"./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}"
//results will be parsed, stored, and analyzed within the python script
//the script will return 0 if the performance criteria are met
//or return 1 if the criteria are not met
archiveArtifacts
"${gemm_log}"
sh
"python3 parse_perf_data.py ${gemm_log} "
//run resnet50 test
def
resnet_log
=
"perf_resnet50_${gpu_arch}.log"
sh
"rm -f ${resnet_log}"
sh
"echo Branch name: ${env.BRANCH_NAME} > ${resnet_log}"
sh
"echo Node name: ${NODE_NAME} >> ${resnet_log}"
sh
"echo GPU_arch name: ${gpu_arch} >> ${resnet_log}"
sh
"rocminfo | grep 'Compute Unit:' >> ${resnet_log} "
sh
"hipcc --version | grep -e 'HIP version' >> ${resnet_log}"
sh
"/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}"
//first run tests with N=256
sh
"./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log}"
//then run with N=4
sh
"./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log}"
archiveArtifacts
"${resnet_log}"
//the script will put the results from N=256 and N=4 runs into separate tables
sh
"python3 parse_perf_data.py ${resnet_log} "
if
(
params
.
RUN_FULL_QA
){
def
qa_log
=
"qa_${gpu_arch}.log"
sh
"./run_full_performance_tests.sh 1 QA_${params.COMPILER_VERSION} ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts
"perf_gemm_${gpu_arch}.log"
archiveArtifacts
"perf_resnet50_N256_${gpu_arch}.log"
archiveArtifacts
"perf_resnet50_N4_${gpu_arch}.log"
archiveArtifacts
"perf_batched_gemm_${gpu_arch}.log"
archiveArtifacts
"perf_grouped_gemm_${gpu_arch}.log"
archiveArtifacts
"perf_conv_fwd_${gpu_arch}.log"
archiveArtifacts
"perf_conv_bwd_data_${gpu_arch}.log"
archiveArtifacts
"perf_gemm_bilinear_${gpu_arch}.log"
archiveArtifacts
"perf_reduction_${gpu_arch}.log"
// stash perf files to master
stash
name:
"perf_gemm_${gpu_arch}.log"
stash
name:
"perf_resnet50_N256_${gpu_arch}.log"
stash
name:
"perf_resnet50_N4_${gpu_arch}.log"
stash
name:
"perf_batched_gemm_${gpu_arch}.log"
stash
name:
"perf_grouped_gemm_${gpu_arch}.log"
stash
name:
"perf_conv_fwd_${gpu_arch}.log"
stash
name:
"perf_conv_bwd_data_${gpu_arch}.log"
stash
name:
"perf_gemm_bilinear_${gpu_arch}.log"
stash
name:
"perf_reduction_${gpu_arch}.log"
//we will process results on the master node
}
else
{
sh
"./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
archiveArtifacts
"perf_gemm_${gpu_arch}.log"
archiveArtifacts
"perf_resnet50_N256_${gpu_arch}.log"
archiveArtifacts
"perf_resnet50_N4_${gpu_arch}.log"
// stash perf files to master
stash
name:
"perf_gemm_${gpu_arch}.log"
stash
name:
"perf_resnet50_N256_${gpu_arch}.log"
stash
name:
"perf_resnet50_N4_${gpu_arch}.log"
//we will process the results on the master node
}
}
}
}
...
...
@@ -283,7 +368,6 @@ def runCKProfiler(Map conf=[:]){
return
retimage
}
def
runPerfTest
(
Map
conf
=[:]){
try
{
runCKProfiler
(
conf
)
...
...
@@ -300,16 +384,97 @@ def runPerfTest(Map conf=[:]){
}
}
/**
 * Unstashes the performance logs for one GPU architecture and runs the
 * matching post-processing script inside the pipeline's docker image.
 *
 * @param conf optional settings; reads "gpu_arch" (default "gfx908") and
 *             "enforce_xnack_on" (default false), and is forwarded to
 *             getDockerImage()
 */
def process_results(Map conf=[:]){
    env.HSA_ENABLE_SDMA=0
    checkout scm
    // Initial image name; replaced below by the name returned from getDockerImage().
    def image = "composable_kernels_${params.COMPILER_VERSION}"
    def prefixpath = "/opt/rocm"
    def gpu_arch = conf.get("gpu_arch", "gfx908")

    // Jenkins is complaining about the render group
    def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
    if (conf.get("enforce_xnack_on", false)) {
        dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
    }
    // Only referenced by the commented-out docker.build call below.
    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='release' "

    def variant = env.STAGE_NAME
    def retimage

    // Only the image lookup runs inside the git status wrapper.
    gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
        try {
            //retimage = docker.build("${image}", dockerArgs + '.')
            (retimage, image) = getDockerImage(conf)
        }
        catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
            echo "The job was cancelled or aborted"
            throw e
        }
    }

    withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
        timeout(time: 1, unit: 'HOURS'){
            try{
                dir("script"){
                    if (params.RUN_FULL_QA){
                        // unstash perf files to master
                        // Full QA mode: nine logs, then process_qa_data.sh.
                        unstash "perf_gemm_${gpu_arch}.log"
                        unstash "perf_resnet50_N256_${gpu_arch}.log"
                        unstash "perf_resnet50_N4_${gpu_arch}.log"
                        unstash "perf_batched_gemm_${gpu_arch}.log"
                        unstash "perf_grouped_gemm_${gpu_arch}.log"
                        unstash "perf_conv_fwd_${gpu_arch}.log"
                        unstash "perf_conv_bwd_data_${gpu_arch}.log"
                        unstash "perf_gemm_bilinear_${gpu_arch}.log"
                        unstash "perf_reduction_${gpu_arch}.log"
                        sh "./process_qa_data.sh ${gpu_arch}"
                    }
                    else{
                        // unstash perf files to master
                        // Default mode: three logs, then process_perf_data.sh.
                        unstash "perf_gemm_${gpu_arch}.log"
                        unstash "perf_resnet50_N256_${gpu_arch}.log"
                        unstash "perf_resnet50_N4_${gpu_arch}.log"
                        sh "./process_perf_data.sh ${gpu_arch}"
                    }
                }
            }
            catch(e){
                // Log the failure, then rethrow so the stage still fails.
                echo "throwing error exception while processing performance test results"
                echo 'Exception occurred: ' + e.toString()
                throw e
            }
        }
    }
}
//launch develop branch daily at 23:00 in FULL_QA mode
CRON_SETTINGS
=
BRANCH_NAME
==
"develop"
?
'''0 23 * * * % RUN_FULL_QA=true'''
:
""
pipeline
{
agent
none
triggers
{
parameterizedCron
(
CRON_SETTINGS
)
}
options
{
parallelsAlwaysFailFast
()
}
parameters
{
booleanParam
(
name:
"
USE
_DOCKER
FILE
"
,
name:
"
BUILD
_DOCKER"
,
defaultValue:
true
,
description:
""
)
description:
"Force building docker image (default: true)"
)
string
(
name:
'COMPILER_VERSION'
,
defaultValue:
'ck-9110'
,
description:
'Specify which version of compiler to use: ck-9110 (default), release, or amd-stg-open.'
)
booleanParam
(
name:
"RUN_FULL_QA"
,
defaultValue:
false
,
description:
"Select whether to run small set of performance tests (default) or full QA"
)
booleanParam
(
name:
"TEST_NODE_PERFORMANCE"
,
defaultValue:
false
,
description:
"Test the node GPU performance (default: false)"
)
}
environment
{
dbuser
=
"${dbuser}"
...
...
@@ -319,9 +484,28 @@ pipeline {
dbsshuser
=
"${dbsshuser}"
dbsshpassword
=
"${dbsshpassword}"
status_wrapper_creds
=
"${status_wrapper_creds}"
gerrit_cred
=
"${gerrit_cred}"
DOCKER_BUILDKIT
=
"1"
}
stages
{
stage
(
"Build Docker"
){
when
{
expression
{
params
.
BUILD_DOCKER
.
toBoolean
()
}
}
parallel
{
stage
(
'Docker /opt/rocm'
){
agent
{
label
rocmnode
(
"nogpu"
)
}
steps
{
buildDocker
(
'/opt/rocm'
)
}
}
}
}
stage
(
"Static checks"
)
{
when
{
beforeAgent
true
expression
{
!
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
parallel
{
// enable after we move from hipcc to hip-clang
// stage('Tidy') {
...
...
@@ -355,6 +539,10 @@ pipeline {
}
stage
(
"Tests"
)
{
when
{
beforeAgent
true
expression
{
!
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
parallel
{
stage
(
"Run Tests: gfx908"
)
...
...
@@ -369,6 +557,10 @@ pipeline {
}
stage
(
"Run Tests: gfx90a"
)
{
when
{
beforeAgent
true
expression
{
params
.
RUN_FULL_QA
.
toBoolean
()
}
}
agent
{
label
rocmnode
(
"gfx90a"
)}
environment
{
setup_args
=
""" -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
...
...
@@ -381,6 +573,10 @@ pipeline {
}
stage
(
"Client App"
)
{
when
{
beforeAgent
true
expression
{
!
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
parallel
{
stage
(
"Run Client App"
)
...
...
@@ -402,6 +598,10 @@ pipeline {
{
stage
(
"Run ckProfiler: gfx908"
)
{
when
{
beforeAgent
true
expression
{
!
params
.
RUN_FULL_QA
.
toBoolean
()
&&
!
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
agent
{
label
rocmnode
(
"gfx908"
)}
environment
{
setup_args
=
""" -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
...
...
@@ -412,6 +612,10 @@ pipeline {
}
stage
(
"Run ckProfiler: gfx90a"
)
{
when
{
beforeAgent
true
expression
{
params
.
RUN_FULL_QA
.
toBoolean
()
||
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
agent
{
label
rocmnode
(
"gfx90a"
)}
environment
{
setup_args
=
""" -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
...
...
@@ -422,6 +626,33 @@ pipeline {
}
}
}
stage
(
"Process Performance Test Results"
)
{
parallel
{
stage
(
"Process results for gfx908"
){
when
{
beforeAgent
true
expression
{
!
params
.
RUN_FULL_QA
.
toBoolean
()
&&
!
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
agent
{
label
'mici'
}
steps
{
process_results
(
gpu_arch:
"gfx908"
)
}
}
stage
(
"Process results for gfx90a"
){
when
{
beforeAgent
true
expression
{
params
.
RUN_FULL_QA
.
toBoolean
()
||
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
agent
{
label
'mici'
}
steps
{
process_results
(
gpu_arch:
"gfx90a"
)
}
}
}
}
/* enable after the cmake file supports packaging
stage("Packages") {
when {
...
...
README.md
View file @
aa5859e4
...
...
@@ -10,7 +10,7 @@ rocm/tensorflow:rocm5.1-tf2.6-dev \
/bin/bash
```
# Install
the new
rocm-cmake
version
# Install
newer version of
rocm-cmake
https://github.com/RadeonOpenCompute/rocm-cmake
## Build
...
...
@@ -26,6 +26,7 @@ cmake \
-D
CMAKE_CXX_FLAGS
=
" --offload-arch=gfx908 --offload-arch=gfx90a -O3"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
-D
CMAKE_INSTALL_PREFIX
=
${
PATH_TO_CK_INSTALL_DIRECTORY
}
\
..
```
...
...
@@ -47,6 +48,13 @@ Instructions for running each individual examples are under ```example/```
```
Instructions for running ckProfiler are under
```profiler/```
## Install CK
```
bash
make
install
```
## Using CK as pre-built kernel library
Instructions for using CK as a pre-built kernel library are under
```client_example/```
## Caveat
### Kernel Timing and Verification
...
...
client_example/01_gemm/CMakeLists.txt
0 → 100644
View file @
aa5859e4
# Client GEMM example: builds gemm.cpp into the client_gemm executable and
# links it against the composable_kernel::device_operations target.
add_executable(client_gemm gemm.cpp)
target_link_libraries(client_gemm PRIVATE composable_kernel::device_operations)
client_example/01_gemm/gemm.cpp
0 → 100644
View file @
aa5859e4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CElementOp
=
PassThrough
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
CDataType
=
F16
;
using
ALayout
=
Row
;
using
BLayout
=
Col
;
using
CLayout
=
Row
;
// Minimal owner of one device allocation: hipMalloc in the constructor,
// hipFree in the destructor.
//
// The object is move-only. Copying is disabled because two copies would hold
// the same pointer and both destructors would call hipFree on it.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    // Requests mem_size bytes of device memory. The hipMalloc status is
    // deliberately discarded; p_mem_ is value-initialized to nullptr before
    // the call.
    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    // Transfers ownership; the moved-from object is left holding nullptr.
    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_{other.p_mem_}
    {
        other.p_mem_ = nullptr;
    }

    // Releases the current allocation, then takes over the one held by other.
    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
    {
        if(this != &other)
        {
            (void)hipFree(p_mem_);
            p_mem_       = other.p_mem_;
            other.p_mem_ = nullptr;
        }
        return *this;
    }

    // Returns the raw device pointer; ownership stays with this object.
    void* GetDeviceBuffer() const { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
int
main
(
int
argc
,
char
*
argv
[])
{
// GEMM shape
ck
::
index_t
M
=
3840
;
ck
::
index_t
N
=
4096
;
ck
::
index_t
K
=
4096
;
ck
::
index_t
StrideA
=
4096
;
ck
::
index_t
StrideB
=
4096
;
ck
::
index_t
StrideC
=
4096
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
7
)
{
M
=
std
::
stoi
(
argv
[
1
]);
N
=
std
::
stoi
(
argv
[
2
]);
K
=
std
::
stoi
(
argv
[
3
]);
StrideA
=
std
::
stoi
(
argv
[
4
]);
StrideB
=
std
::
stoi
(
argv
[
5
]);
StrideC
=
std
::
stoi
(
argv
[
6
]);
}
else
{
printf
(
"arg1 to 6: M, N, K, StrideA, StrideB, StrideC
\n
"
);
exit
(
0
);
}
auto
f_matrix_space_size
=
[](
std
::
size_t
nRow
,
std
::
size_t
nCol
,
std
::
size_t
stride
,
auto
layout
)
{
using
Layout
=
decltype
(
layout
);
if
(
std
::
is_same
<
Layout
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
(
nRow
-
1
)
*
stride
+
nCol
;
}
else
{
return
(
nCol
-
1
)
*
stride
+
nRow
;
}
};
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
f_matrix_space_size
(
M
,
K
,
StrideA
,
ALayout
{}));
SimpleDeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
f_matrix_space_size
(
K
,
N
,
StrideB
,
BLayout
{}));
SimpleDeviceMem
c_device_buf
(
sizeof
(
CDataType
)
*
f_matrix_space_size
(
M
,
N
,
StrideC
,
CLayout
{}));
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
c_element_op
=
CElementOp
{};
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
c_device_buf
.
GetDeviceBuffer
(),
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
a_element_op
,
b_element_op
,
c_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
CDataType
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
// run the best instance
{
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
c_device_buf
.
GetDeviceBuffer
(),
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
a_element_op
,
b_element_op
,
c_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
return
0
;
}
client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
View file @
aa5859e4
...
...
@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/
device_
gemm_add_add_fastgelu
_instance
.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
...
...
@@ -27,7 +27,6 @@ using CDEElementOp = AddAddFastGelu;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
AccDataType
=
F32
;
using
D0DataType
=
F16
;
using
D1DataType
=
F16
;
using
EDataType
=
F16
;
...
...
@@ -111,19 +110,22 @@ int main(int argc, char* argv[])
f_matrix_space_size
(
M
,
N
,
StrideD1
,
D1Layout
{}));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_matrix_space_size
(
M
,
N
,
StrideE
,
ELayout
{}));
// add device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
get_device_gemm_add_add_fastgelu_instances
<
ADataType
,
BDataType
,
AccDataType
,
D0DataType
,
D1DataType
,
EDataType
,
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGemmMultipleD
<
ALayout
,
BLayout
,
D0Layout
,
D1Layout
,
ELayout
>
();
ck
::
Tuple
<
D0Layout
,
D1Layout
>
,
ELayout
,
ADataType
,
BDataType
,
ck
::
Tuple
<
D0DataType
,
D1DataType
>
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
AddAddFastGelu
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
...
...
@@ -231,6 +233,8 @@ int main(int argc, char* argv[])
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
return
0
;
...
...
client_example/03_gemm_layernorm/CMakeLists.txt
View file @
aa5859e4
add_executable
(
gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp
)
target_link_libraries
(
gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations
)
add_executable
(
client_
gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp
)
target_link_libraries
(
client_
gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations
)
client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
View file @
aa5859e4
...
...
@@ -160,8 +160,9 @@ int main()
ck
::
index_t
StrideC
=
1024
;
ck
::
index_t
StrideD0
=
1024
;
const
auto
gemm_reduce_ptrs
=
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
get_device_gemm_add_add_mean_squaremean_instances
<
ADataType
,
const
auto
gemm_reduce_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
get_device_gemm_add_add_mean_squaremean_instances
<
ADataType
,
BDataType
,
CDataType
,
ALayout
,
...
...
@@ -169,7 +170,7 @@ int main()
CLayout
>
();
const
auto
normalize_ptrs
=
ck
::
tensor_operation
::
device
::
get_device_normalize_from_mean_meansquare_instances
<
ck
::
tensor_operation
::
device
::
instance
::
get_device_normalize_from_mean_meansquare_instances
<
CDataType
,
ReduceDataType
,
ReduceDataType
,
...
...
client_example/04_contraction/CMakeLists.txt
0 → 100644
View file @
aa5859e4
# Client examples for tensor contraction. Each executable below is built from a
# single source file and linked against composable_kernel::device_operations.

# Contraction example built from contraction_scale.cpp.
add_executable
(
client_contraction_scale contraction_scale.cpp
)
target_link_libraries
(
client_contraction_scale PRIVATE composable_kernel::device_operations
)
# Contraction example built from contraction_bilinear.cpp.
add_executable
(
client_contraction_bilinear contraction_bilinear.cpp
)
target_link_libraries
(
client_contraction_bilinear PRIVATE composable_kernel::device_operations
)
client_example/04_contraction/contraction_bilinear.cpp
0 → 100644
View file @
aa5859e4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Bilinear
=
ck
::
tensor_operation
::
element_wise
::
Bilinear
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
Bilinear
;
using
ADataType
=
F32
;
using
BDataType
=
F32
;
using
AccDataType
=
F32
;
using
CShuffleDataType
=
F32
;
using
DDataType
=
F32
;
using
DsDataType
=
ck
::
Tuple
<
DDataType
>
;
using
EDataType
=
F32
;
static
constexpr
ck
::
index_t
NumDimM
=
2
;
static
constexpr
ck
::
index_t
NumDimN
=
2
;
static
constexpr
ck
::
index_t
NumDimK
=
2
;
/// Minimal owning wrapper around one device allocation.
///
/// The constructor requests `mem_size` bytes with hipMalloc and the destructor
/// hands the pointer back to hipFree. Both HIP return codes are deliberately
/// discarded (best effort, as in the other client examples); `p_mem_` is
/// value-initialized to nullptr before the allocation call.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Allocates `mem_size` bytes of device memory.
    /// `explicit` so that an integer is never silently converted into a buffer.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    // The object owns p_mem_; an implicit copy would make two destructors call
    // hipFree on the same pointer, so copying is disabled.
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw device pointer (still owned by this object).
    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
int
main
(
int
argc
,
char
*
argv
[])
{
// A[M0, M1, K0, K1]
std
::
vector
<
ck
::
index_t
>
a_ms_ks_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
a_ms_ks_strides
{
524288
,
4096
,
128
,
1
};
// B[N0, N1, K0, K1]
std
::
vector
<
ck
::
index_t
>
b_ns_ks_lengths
{
32
,
64
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
b_ns_ks_strides
{
524288
,
4096
,
128
,
1
};
// D[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
d_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
d_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
// E[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
e_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
e_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
float
alpha
=
1.
f
;
float
beta
=
1.
f
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
25
)
{
const
ck
::
index_t
M0
=
std
::
stoi
(
argv
[
1
]);
const
ck
::
index_t
M1
=
std
::
stoi
(
argv
[
2
]);
const
ck
::
index_t
N0
=
std
::
stoi
(
argv
[
3
]);
const
ck
::
index_t
N1
=
std
::
stoi
(
argv
[
4
]);
const
ck
::
index_t
K0
=
std
::
stoi
(
argv
[
5
]);
const
ck
::
index_t
K1
=
std
::
stoi
(
argv
[
6
]);
a_ms_ks_lengths
=
{
M0
,
M1
,
K0
,
K1
};
a_ms_ks_strides
=
{
std
::
stoi
(
argv
[
7
]),
std
::
stoi
(
argv
[
8
]),
std
::
stoi
(
argv
[
9
]),
std
::
stoi
(
argv
[
10
])};
b_ns_ks_lengths
=
{
N0
,
N1
,
K0
,
K1
};
b_ns_ks_strides
=
{
std
::
stoi
(
argv
[
11
]),
std
::
stoi
(
argv
[
12
]),
std
::
stoi
(
argv
[
13
]),
std
::
stoi
(
argv
[
14
])};
d_ms_ns_lengths
=
{
M0
,
M1
,
N0
,
N1
};
d_ms_ns_strides
=
{
std
::
stoi
(
argv
[
15
]),
std
::
stoi
(
argv
[
16
]),
std
::
stoi
(
argv
[
17
]),
std
::
stoi
(
argv
[
18
])};
e_ms_ns_lengths
=
{
M0
,
M1
,
N0
,
N1
};
e_ms_ns_strides
=
{
std
::
stoi
(
argv
[
19
]),
std
::
stoi
(
argv
[
20
]),
std
::
stoi
(
argv
[
21
]),
std
::
stoi
(
argv
[
22
])};
alpha
=
std
::
stof
(
argv
[
23
]);
beta
=
std
::
stof
(
argv
[
24
]);
}
else
{
printf
(
"arg1 to 6: M0, M1, N0, N1, K0, K1
\n
"
);
printf
(
"arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1
\n
"
);
printf
(
"arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1
\n
"
);
printf
(
"arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1
\n
"
);
printf
(
"arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1
\n
"
);
printf
(
"arg23 to 24: alpha, beta
\n
"
);
exit
(
0
);
}
auto
f_tensor_space_size
=
[](
auto
lengths
,
auto
strides
)
{
std
::
size_t
space_size
=
1
;
for
(
std
::
size_t
i
=
0
;
i
<
lengths
.
size
();
++
i
)
{
space_size
+=
(
lengths
[
i
]
-
1
)
*
strides
[
i
];
}
return
space_size
;
};
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
f_tensor_space_size
(
a_ms_ks_lengths
,
a_ms_ks_strides
));
SimpleDeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
f_tensor_space_size
(
b_ns_ks_lengths
,
b_ns_ks_strides
));
SimpleDeviceMem
d_device_buf
(
sizeof
(
DDataType
)
*
f_tensor_space_size
(
d_ms_ns_lengths
,
d_ms_ns_strides
));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_tensor_space_size
(
e_ms_ns_lengths
,
e_ms_ns_strides
));
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceContractionMultipleD
<
NumDimM
,
NumDimN
,
NumDimK
,
ADataType
,
BDataType
,
ck
::
Tuple
<
DDataType
>
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
Bilinear
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{
alpha
,
beta
};
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
d_device_buf
.
GetDeviceBuffer
()},
e_device_buf
.
GetDeviceBuffer
(),
a_ms_ks_lengths
,
a_ms_ks_strides
,
b_ns_ks_lengths
,
b_ns_ks_strides
,
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
d_ms_ns_lengths
},
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
d_ms_ns_strides
},
e_ms_ns_lengths
,
e_ms_ns_strides
,
a_element_op
,
b_element_op
,
cde_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
ck
::
index_t
M
=
std
::
accumulate
(
e_ms_ns_lengths
.
begin
(),
e_ms_ns_lengths
.
begin
()
+
NumDimM
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
ck
::
index_t
N
=
std
::
accumulate
(
e_ms_ns_lengths
.
begin
()
+
NumDimM
,
e_ms_ns_lengths
.
begin
()
+
NumDimM
+
NumDimN
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
ck
::
index_t
K
=
std
::
accumulate
(
a_ms_ks_lengths
.
begin
()
+
NumDimM
,
a_ms_ks_lengths
.
begin
()
+
NumDimM
+
NumDimK
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
DDataType
)
*
M
*
N
+
sizeof
(
EDataType
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
return
0
;
}
client_example/04_contraction/contraction_scale.cpp
0 → 100644
View file @
aa5859e4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Scale
=
ck
::
tensor_operation
::
element_wise
::
Scale
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
Scale
;
using
ADataType
=
F32
;
using
BDataType
=
F32
;
using
AccDataType
=
F32
;
using
CShuffleDataType
=
F32
;
using
DsDataType
=
ck
::
Tuple
<>
;
using
EDataType
=
F32
;
static
constexpr
ck
::
index_t
NumDimM
=
2
;
static
constexpr
ck
::
index_t
NumDimN
=
2
;
static
constexpr
ck
::
index_t
NumDimK
=
2
;
/// Minimal owning wrapper around one device allocation.
///
/// The constructor requests `mem_size` bytes with hipMalloc and the destructor
/// hands the pointer back to hipFree. Both HIP return codes are deliberately
/// discarded (best effort, as in the other client examples); `p_mem_` is
/// value-initialized to nullptr before the allocation call.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Allocates `mem_size` bytes of device memory.
    /// `explicit` so that an integer is never silently converted into a buffer.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    // The object owns p_mem_; an implicit copy would make two destructors call
    // hipFree on the same pointer, so copying is disabled.
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw device pointer (still owned by this object).
    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
int
main
(
int
argc
,
char
*
argv
[])
{
// A[M0, M1, K0, K1]
std
::
vector
<
ck
::
index_t
>
a_ms_ks_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
a_ms_ks_strides
{
524288
,
4096
,
128
,
1
};
// B[N0, N1, K0, K1]
std
::
vector
<
ck
::
index_t
>
b_ns_ks_lengths
{
32
,
64
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
b_ns_ks_strides
{
524288
,
4096
,
128
,
1
};
// E[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
e_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
e_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
float
scale
=
1.
f
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
20
)
{
const
ck
::
index_t
M0
=
std
::
stoi
(
argv
[
1
]);
const
ck
::
index_t
M1
=
std
::
stoi
(
argv
[
2
]);
const
ck
::
index_t
N0
=
std
::
stoi
(
argv
[
3
]);
const
ck
::
index_t
N1
=
std
::
stoi
(
argv
[
4
]);
const
ck
::
index_t
K0
=
std
::
stoi
(
argv
[
5
]);
const
ck
::
index_t
K1
=
std
::
stoi
(
argv
[
6
]);
a_ms_ks_lengths
=
{
M0
,
M1
,
K0
,
K1
};
a_ms_ks_strides
=
{
std
::
stoi
(
argv
[
7
]),
std
::
stoi
(
argv
[
8
]),
std
::
stoi
(
argv
[
9
]),
std
::
stoi
(
argv
[
10
])};
b_ns_ks_lengths
=
{
N0
,
N1
,
K0
,
K1
};
b_ns_ks_strides
=
{
std
::
stoi
(
argv
[
11
]),
std
::
stoi
(
argv
[
12
]),
std
::
stoi
(
argv
[
13
]),
std
::
stoi
(
argv
[
14
])};
e_ms_ns_lengths
=
{
M0
,
M1
,
N0
,
N1
};
e_ms_ns_strides
=
{
std
::
stoi
(
argv
[
15
]),
std
::
stoi
(
argv
[
16
]),
std
::
stoi
(
argv
[
17
]),
std
::
stoi
(
argv
[
18
])};
scale
=
std
::
stof
(
argv
[
19
]);
}
else
{
printf
(
"arg1 to 6: M0, M1, N0, N1, K0, K1
\n
"
);
printf
(
"arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1
\n
"
);
printf
(
"arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1
\n
"
);
printf
(
"arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1
\n
"
);
printf
(
"arg19: scale
\n
"
);
exit
(
0
);
}
auto
f_tensor_space_size
=
[](
auto
lengths
,
auto
strides
)
{
std
::
size_t
space_size
=
1
;
for
(
std
::
size_t
i
=
0
;
i
<
lengths
.
size
();
++
i
)
{
space_size
+=
(
lengths
[
i
]
-
1
)
*
strides
[
i
];
}
return
space_size
;
};
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
f_tensor_space_size
(
a_ms_ks_lengths
,
a_ms_ks_strides
));
SimpleDeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
f_tensor_space_size
(
b_ns_ks_lengths
,
b_ns_ks_strides
));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_tensor_space_size
(
e_ms_ns_lengths
,
e_ms_ns_strides
));
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceContractionMultipleD
<
NumDimM
,
NumDimN
,
NumDimK
,
ADataType
,
BDataType
,
ck
::
Tuple
<>
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
Scale
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{
scale
};
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
0
>
{},
e_device_buf
.
GetDeviceBuffer
(),
a_ms_ks_lengths
,
a_ms_ks_strides
,
b_ns_ks_lengths
,
b_ns_ks_strides
,
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
0
>
{},
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
0
>
{},
e_ms_ns_lengths
,
e_ms_ns_strides
,
a_element_op
,
b_element_op
,
cde_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
ck
::
index_t
M
=
std
::
accumulate
(
e_ms_ns_lengths
.
begin
(),
e_ms_ns_lengths
.
begin
()
+
NumDimM
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
ck
::
index_t
N
=
std
::
accumulate
(
e_ms_ns_lengths
.
begin
()
+
NumDimM
,
e_ms_ns_lengths
.
begin
()
+
NumDimM
+
NumDimN
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
ck
::
index_t
K
=
std
::
accumulate
(
a_ms_ks_lengths
.
begin
()
+
NumDimM
,
a_ms_ks_lengths
.
begin
()
+
NumDimM
+
NumDimK
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
EDataType
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
return
0
;
}
client_example/CMakeLists.txt
View file @
aa5859e4
...
...
@@ -6,5 +6,7 @@ find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
find_package
(
hip REQUIRED PATHS /opt/rocm
)
message
(
STATUS
"Build with HIP
${
hip_VERSION
}
"
)
add_subdirectory
(
01_gemm
)
add_subdirectory
(
02_gemm_add_add_fastgelu
)
add_subdirectory
(
03_gemm_layernorm
)
add_subdirectory
(
04_contraction
)
client_example/README.md
View file @
aa5859e4
##
Client application links to CK library, and therefore CK library needs to be installed before building client applications.
## Docker script
```
bash
docker run
\
-it
\
--privileged
\
--group-add
sudo
\
-w
/root/workspace
\
-v
${
PATH_TO_LOCAL_WORKSPACE
}
:/root/workspace
\
rocm/tensorflow:rocm5.1-tf2.6-dev
\
/bin/bash
```
## Build
```
bash
...
...
@@ -22,7 +11,7 @@ cd client_example/build
```
bash
cmake
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
-D
CMAKE_PREFIX_PATH
=
"
/opt/rocm
;
${
PATH_TO_CK_INSTALL_DIRECTORY
}
"
\
..
```
...
...
cmake/googletest.cmake
View file @
aa5859e4
...
...
@@ -20,6 +20,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-unused-member-function
-Wno-comma
-Wno-old-style-cast
-Wno-deprecated
)
message
(
STATUS
"Suppressing googltest warnings with flags:
${
GTEST_CMAKE_CXX_FLAGS
}
"
)
...
...
example/01_gemm/CMakeLists.txt
View file @
aa5859e4
...
...
@@ -4,5 +4,6 @@ add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
add_example_executable
(
example_gemm_xdl_fp16 gemm_xdl_fp16.cpp
)
add_example_executable
(
example_gemm_xdl_bf16 gemm_xdl_bf16.cpp
)
add_example_executable
(
example_gemm_xdl_int8 gemm_xdl_int8.cpp
)
add_example_executable
(
example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp
)
# FIXME: re-enable this example as test when SWDEV-335738 is fixed
add_example_executable_no_testing
(
example_gemm_xdl_fp64 gemm_xdl_fp64.cpp
)
example/01_gemm/gemm_dl_fp16.cpp
View file @
aa5859e4
...
...
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/
host_tensor
/device_memory.hpp"
#include "ck/library/
host_tensor
/host_tensor.hpp"
#include "ck/library/
host_tensor
/host_tensor_generator.hpp"
#include "ck/library/
utility
/device_memory.hpp"
#include "ck/library/
utility
/host_tensor.hpp"
#include "ck/library/
utility
/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
...
...
@@ -142,9 +142,9 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_Sequential
<
1
>
{});
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
...
...
example/01_gemm/gemm_dl_fp32.cpp
View file @
aa5859e4
...
...
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/
host_tensor
/device_memory.hpp"
#include "ck/library/
host_tensor
/host_tensor.hpp"
#include "ck/library/
host_tensor
/host_tensor_generator.hpp"
#include "ck/library/
utility
/device_memory.hpp"
#include "ck/library/
utility
/host_tensor.hpp"
#include "ck/library/
utility
/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
...
...
@@ -141,9 +141,9 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_Sequential
<
1
>
{});
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
...
...
example/01_gemm/gemm_dl_int8.cpp
View file @
aa5859e4
...
...
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/
host_tensor
/device_memory.hpp"
#include "ck/library/
host_tensor
/host_tensor.hpp"
#include "ck/library/
host_tensor
/host_tensor_generator.hpp"
#include "ck/library/
utility
/device_memory.hpp"
#include "ck/library/
utility
/host_tensor.hpp"
#include "ck/library/
utility
/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
...
...
@@ -139,9 +139,9 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_Sequential
<
1
>
{});
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
...
...
example/01_gemm/gemm_xdl_bf16.cpp
View file @
aa5859e4
...
...
@@ -11,9 +11,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/library/
host_tensor
/device_memory.hpp"
#include "ck/library/
host_tensor
/host_tensor.hpp"
#include "ck/library/
host_tensor
/host_tensor_generator.hpp"
#include "ck/library/
utility
/device_memory.hpp"
#include "ck/library/
utility
/host_tensor.hpp"
#include "ck/library/
utility
/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
...
...
@@ -170,9 +170,9 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
...
...
Prev
1
2
3
4
5
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment