Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
aa5859e4
Commit
aa5859e4
authored
Aug 13, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into wavelet_model
parents
9bd6cc0e
5ee30459
Changes
280
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1129 additions
and
203 deletions
+1129
-203
CMakeLists.txt
CMakeLists.txt
+1
-8
Dockerfile
Dockerfile
+17
-15
Jenkinsfile
Jenkinsfile
+343
-112
README.md
README.md
+9
-1
client_example/01_gemm/CMakeLists.txt
client_example/01_gemm/CMakeLists.txt
+2
-0
client_example/01_gemm/gemm.cpp
client_example/01_gemm/gemm.cpp
+218
-0
client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
...xample/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
+24
-20
client_example/03_gemm_layernorm/CMakeLists.txt
client_example/03_gemm_layernorm/CMakeLists.txt
+2
-2
client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
+10
-9
client_example/04_contraction/CMakeLists.txt
client_example/04_contraction/CMakeLists.txt
+6
-0
client_example/04_contraction/contraction_bilinear.cpp
client_example/04_contraction/contraction_bilinear.cpp
+241
-0
client_example/04_contraction/contraction_scale.cpp
client_example/04_contraction/contraction_scale.cpp
+227
-0
client_example/CMakeLists.txt
client_example/CMakeLists.txt
+2
-0
client_example/README.md
client_example/README.md
+1
-12
cmake/googletest.cmake
cmake/googletest.cmake
+1
-0
example/01_gemm/CMakeLists.txt
example/01_gemm/CMakeLists.txt
+1
-0
example/01_gemm/gemm_dl_fp16.cpp
example/01_gemm/gemm_dl_fp16.cpp
+6
-6
example/01_gemm/gemm_dl_fp32.cpp
example/01_gemm/gemm_dl_fp32.cpp
+6
-6
example/01_gemm/gemm_dl_int8.cpp
example/01_gemm/gemm_dl_int8.cpp
+6
-6
example/01_gemm/gemm_xdl_bf16.cpp
example/01_gemm/gemm_xdl_bf16.cpp
+6
-6
No files found.
Too many changes to show.
To preserve performance only
280 of 280+
files are displayed.
Plain diff
Email patch
CMakeLists.txt
View file @
aa5859e4
...
@@ -8,7 +8,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
...
@@ -8,7 +8,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
enable_testing
()
enable_testing
()
set
(
ROCM_SYMLINK_LIBS OFF
)
set
(
ROCM_SYMLINK_LIBS OFF
)
find_package
(
ROCM
0.8
REQUIRED PATHS /opt/rocm
)
find_package
(
ROCM REQUIRED PATHS /opt/rocm
)
include
(
ROCMInstallTargets
)
include
(
ROCMInstallTargets
)
include
(
ROCMPackageConfigHelpers
)
include
(
ROCMPackageConfigHelpers
)
...
@@ -71,13 +71,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
...
@@ -71,13 +71,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
endif
()
endif
()
message
(
STATUS
"Build with HIP
${
HIP_VERSION
}
"
)
message
(
STATUS
"Build with HIP
${
HIP_VERSION
}
"
)
rocm_create_package
(
NAME composablekernel
DESCRIPTION
"High Performance Composable Kernel for AMD GPUs"
MAINTAINER
"MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
LDCONFIG
)
## tidy
## tidy
include
(
EnableCompilerWarnings
)
include
(
EnableCompilerWarnings
)
set
(
CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name
)
set
(
CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name
)
...
...
Dockerfile
View file @
aa5859e4
...
@@ -2,6 +2,7 @@ FROM ubuntu:18.04
...
@@ -2,6 +2,7 @@ FROM ubuntu:18.04
ARG
ROCMVERSION=5.1
ARG
ROCMVERSION=5.1
ARG
OSDB_BKC_VERSION
ARG
OSDB_BKC_VERSION
ARG
compiler_version
RUN
set
-xe
RUN
set
-xe
...
@@ -15,7 +16,6 @@ RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.l
...
@@ -15,7 +16,6 @@ RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.l
RUN
wget
--no-check-certificate
-qO
- https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN
wget
--no-check-certificate
-qO
- https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN
sh
-c
"echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
RUN
sh
-c
"echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
# ADD requirements.txt requirements.txt
# Install dependencies
# Install dependencies
RUN
apt-get update
&&
DEBIAN_FRONTEND
=
noninteractive apt-get
install
-y
--allow-unauthenticated
\
RUN
apt-get update
&&
DEBIAN_FRONTEND
=
noninteractive apt-get
install
-y
--allow-unauthenticated
\
apt-utils
\
apt-utils
\
...
@@ -23,8 +23,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
...
@@ -23,8 +23,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
cmake-data
=
3.15.1-0kitware1
\
cmake-data
=
3.15.1-0kitware1
\
cmake
=
3.15.1-0kitware1
\
cmake
=
3.15.1-0kitware1
\
curl
\
curl
\
g++
\
gdb
\
git
\
git
\
hip-rocclr
\
hip-rocclr
\
jq
\
jq
\
...
@@ -61,17 +59,7 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
...
@@ -61,17 +59,7 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
RUN
wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
RUN
wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
RUN
dpkg
-i
dumb-init_
*
.deb
&&
rm
dumb-init_
*
.deb
RUN
dpkg
-i
dumb-init_
*
.deb
&&
rm
dumb-init_
*
.deb
# Install cget
RUN
pip
install
cget
# Install rclone
RUN
pip
install
https://github.com/pfultz2/rclone/archive/master.tar.gz
ARG
PREFIX=/opt/rocm
ARG
PREFIX=/opt/rocm
# Install dependencies
RUN
cget
install
pfultz2/rocm-recipes
# Install rbuild
RUN
pip3
install
https://github.com/RadeonOpenCompute/rbuild/archive/6d78a0553babdaea8d2da5de15cbda7e869594b8.tar.gz
# Install packages for processing the performance results
# Install packages for processing the performance results
RUN
pip3
install
--upgrade
pip
RUN
pip3
install
--upgrade
pip
RUN
pip3
install
sqlalchemy
RUN
pip3
install
sqlalchemy
...
@@ -84,12 +72,26 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
...
@@ -84,12 +72,26 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
ENV
LC_ALL=C.UTF-8
ENV
LC_ALL=C.UTF-8
ENV
LANG=C.UTF-8
ENV
LANG=C.UTF-8
ADD
rbuild.ini /rbuild.ini
ADD
dev-requirements.txt dev-requirements.txt
ADD
dev-requirements.txt dev-requirements.txt
RUN
rbuild prepare
-s
develop
-d
$PREFIX
RUN
groupadd
-f
render
RUN
groupadd
-f
render
# Install the new rocm-cmake version
# Install the new rocm-cmake version
RUN
git clone
-b
master https://github.com/RadeonOpenCompute/rocm-cmake.git
&&
\
RUN
git clone
-b
master https://github.com/RadeonOpenCompute/rocm-cmake.git
&&
\
cd
rocm-cmake
&&
mkdir
build
&&
cd
build
&&
\
cd
rocm-cmake
&&
mkdir
build
&&
cd
build
&&
\
cmake ..
&&
cmake
--build
.
&&
cmake
--build
.
--target
install
cmake ..
&&
cmake
--build
.
&&
cmake
--build
.
--target
install
WORKDIR
/
ENV
compiler_version=$compiler_version
RUN
sh
-c
"echo compiler version = '
$compiler_version
'"
RUN
--mount
=
type
=
ssh
if
[
"
$compiler_version
"
!=
"release"
]
;
then
\
git clone
-b
"
$compiler_version
"
https://github.com/RadeonOpenCompute/llvm-project.git
&&
\
cd
llvm-project
&&
mkdir
build
&&
cd
build
&&
\
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/rocm/llvm
-DCMAKE_BUILD_TYPE
=
Release
-DLLVM_ENABLE_ASSERTIONS
=
1
-DLLVM_TARGETS_TO_BUILD
=
"AMDGPU;X86"
-DLLVM_ENABLE_PROJECTS
=
"clang;lld;compiler-rt"
../llvm
&&
\
make
-j
8
;
\
else
echo
"using the release compiler"
;
\
fi
#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
Jenkinsfile
View file @
aa5859e4
...
@@ -11,6 +11,96 @@ def show_node_info() {
...
@@ -11,6 +11,96 @@ def show_node_info() {
"""
"""
}
}
def
runShell
(
String
command
){
def
responseCode
=
sh
returnStatus:
true
,
script:
"${command} > tmp.txt"
def
output
=
readFile
(
file:
"tmp.txt"
)
echo
"tmp.txt contents: $output"
return
(
output
!=
""
)
}
def
getDockerImageName
(){
def
img
=
"${env.MIOPEN_IMAGE_URL}:composable_kernels_${params.COMPILER_VERSION}"
return
img
}
def
getDockerImage
(
Map
conf
=[:]){
env
.
DOCKER_BUILDKIT
=
1
def
prefixpath
=
conf
.
get
(
"prefixpath"
,
"/opt/rocm"
)
// prefix:/opt/rocm
def
gpu_arch
=
conf
.
get
(
"gpu_arch"
,
"gfx908"
)
// prebuilt dockers should have all the architectures enabled so one image can be used for all stages
def
no_cache
=
conf
.
get
(
"no_cache"
,
false
)
def
dockerArgs
=
"--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if
(
env
.
CCACHE_HOST
)
{
def
check_host
=
sh
(
script:
"""(printf "PING\r\n";) | nc -N ${env.CCACHE_HOST} 6379 """
,
returnStdout:
true
).
trim
()
if
(
check_host
==
"+PONG"
)
{
echo
"FOUND CCACHE SERVER: ${CCACHE_HOST}"
}
else
{
echo
"CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
}
dockerArgs
=
dockerArgs
+
" --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
env
.
CCACHE_DIR
=
"""/tmp/ccache_store"""
env
.
CCACHE_SECONDARY_STORAGE
=
"""redis://${env.CCACHE_HOST}"""
}
if
(
no_cache
)
{
dockerArgs
=
dockerArgs
+
" --no-cache "
}
echo
"Docker Args: ${dockerArgs}"
def
image
=
getDockerImageName
()
//Check if image exists
def
retimage
try
{
echo
"Pulling down image: ${image}"
retimage
=
docker
.
image
(
"${image}"
)
retimage
.
pull
()
}
catch
(
Exception
ex
)
{
error
"Unable to locate image: ${image}"
}
return
[
retimage
,
image
]
}
def
buildDocker
(
install_prefix
){
show_node_info
()
env
.
DOCKER_BUILDKIT
=
1
checkout
scm
def
image_name
=
getDockerImageName
()
echo
"Building Docker for ${image_name}"
def
dockerArgs
=
"--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if
(
env
.
CCACHE_HOST
)
{
def
check_host
=
sh
(
script:
"""(printf "PING\\r\\n";) | nc -N ${env.CCACHE_HOST} 6379 """
,
returnStdout:
true
).
trim
()
if
(
check_host
==
"+PONG"
)
{
echo
"FOUND CCACHE SERVER: ${CCACHE_HOST}"
}
else
{
echo
"CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
}
dockerArgs
=
dockerArgs
+
" --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
env
.
CCACHE_DIR
=
"""/tmp/ccache_store"""
env
.
CCACHE_SECONDARY_STORAGE
=
"""redis://${env.CCACHE_HOST}"""
}
echo
"Build Args: ${dockerArgs}"
try
{
echo
"Checking for image: ${image_name}"
sh
"docker manifest inspect --insecure ${image_name}"
echo
"Image: ${image_name} found!! Skipping building image"
}
catch
(
Exception
ex
){
echo
"Unable to locate image: ${image_name}. Building image now"
retimage
=
docker
.
build
(
"${image_name}"
,
dockerArgs
+
' .'
)
retimage
.
push
()
}
}
def
cmake_build
(
Map
conf
=[:]){
def
cmake_build
(
Map
conf
=[:]){
def
compiler
=
conf
.
get
(
"compiler"
,
"/opt/rocm/bin/hipcc"
)
def
compiler
=
conf
.
get
(
"compiler"
,
"/opt/rocm/bin/hipcc"
)
...
@@ -60,7 +150,7 @@ def cmake_build(Map conf=[:]){
...
@@ -60,7 +150,7 @@ def cmake_build(Map conf=[:]){
"""
"""
def
setup_cmd
=
conf
.
get
(
"setup_cmd"
,
"${cmake_envs} cmake ${setup_args} .. "
)
def
setup_cmd
=
conf
.
get
(
"setup_cmd"
,
"${cmake_envs} cmake ${setup_args} .. "
)
// reduce parallelism when compiling, clang uses too much memory
// reduce parallelism when compiling, clang uses too much memory
def
build_cmd
=
conf
.
get
(
"build_cmd"
,
"${build_envs} dumb-init make -j\$(( \$(nproc) /
1
)) ${config_targets}"
)
def
build_cmd
=
conf
.
get
(
"build_cmd"
,
"${build_envs} dumb-init make -j\$(( \$(nproc) /
2
)) ${config_targets}"
)
def
execute_cmd
=
conf
.
get
(
"execute_cmd"
,
""
)
def
execute_cmd
=
conf
.
get
(
"execute_cmd"
,
""
)
def
cmd
=
conf
.
get
(
"cmd"
,
"""
def
cmd
=
conf
.
get
(
"cmd"
,
"""
...
@@ -85,7 +175,7 @@ def buildHipClangJob(Map conf=[:]){
...
@@ -85,7 +175,7 @@ def buildHipClangJob(Map conf=[:]){
env
.
HSA_ENABLE_SDMA
=
0
env
.
HSA_ENABLE_SDMA
=
0
checkout
scm
checkout
scm
def
image
=
"composable_kernels"
def
image
=
"composable_kernels
_${params.COMPILER_VERSION}
"
def
prefixpath
=
conf
.
get
(
"prefixpath"
,
"/opt/rocm"
)
def
prefixpath
=
conf
.
get
(
"prefixpath"
,
"/opt/rocm"
)
def
gpu_arch
=
conf
.
get
(
"gpu_arch"
,
"gfx908"
)
def
gpu_arch
=
conf
.
get
(
"gpu_arch"
,
"gfx908"
)
...
@@ -93,22 +183,31 @@ def buildHipClangJob(Map conf=[:]){
...
@@ -93,22 +183,31 @@ def buildHipClangJob(Map conf=[:]){
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def
dockerOpts
=
"--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def
dockerOpts
=
"--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1"
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
//def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if
(
params
.
COMPILER_VERSION
!=
"release"
){
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
def
variant
=
env
.
STAGE_NAME
def
variant
=
env
.
STAGE_NAME
def
retimage
def
retimage
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
if
(
params
.
USE_DOCKERFILE
){
try
{
try
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
'.'
)
//retimage = docker.build("${image}", dockerArgs + '.')
(
retimage
,
image
)
=
getDockerImage
(
conf
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
)
timeout
(
time:
5
,
unit:
'MINUTES'
){
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
if
(
runShell
(
'grep -n "Number of devices:.*. 0" clinfo.log'
)
){
throw
new
Exception
(
"GPU not found"
)
}
else
{
echo
"GPU is OK"
}
}
}
}
}
}
}
...
@@ -117,27 +216,23 @@ def buildHipClangJob(Map conf=[:]){
...
@@ -117,27 +216,23 @@ def buildHipClangJob(Map conf=[:]){
throw
e
throw
e
}
}
catch
(
Exception
ex
)
{
catch
(
Exception
ex
)
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
"--no-cache ."
)
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
"
--no-cache ."
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
)
timeout
(
time:
5
,
unit:
'MINUTES'
){
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log'
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
if
(
runShell
(
'grep -n "Number of devices:.*. 0" clinfo.log'
)
){
throw
new
Exception
(
"GPU not found"
)
}
}
else
{
echo
"GPU is OK"
}
}
}
}
}
}
else
{
timeout
(
time:
3
,
unit:
'HOURS'
){
retimage
=
docker
.
image
(
'compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54'
).
pull
()
image
=
"b56f8ac0d6ea"
sh
"docker images"
}
}
}
withDockerContainer
(
image:
image
,
args:
dockerOpts
+
' -v=/var/jenkins/:/var/jenkins'
)
{
withDockerContainer
(
image:
image
,
args:
dockerOpts
+
' -v=/var/jenkins/:/var/jenkins'
)
{
timeout
(
time:
5
,
unit:
'HOURS'
)
timeout
(
time:
5
,
unit:
'HOURS'
)
{
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
cmake_build
(
conf
)
cmake_build
(
conf
)
}
}
}
}
...
@@ -149,10 +244,6 @@ def reboot(){
...
@@ -149,10 +244,6 @@ def reboot(){
build
job:
'reboot-slaves'
,
propagate:
false
,
parameters:
[
string
(
name:
'server'
,
value:
"${env.NODE_NAME}"
),]
build
job:
'reboot-slaves'
,
propagate:
false
,
parameters:
[
string
(
name:
'server'
,
value:
"${env.NODE_NAME}"
),]
}
}
def
buildHipClangJobAndReboot
(
Map
conf
=[:]){
def
buildHipClangJobAndReboot
(
Map
conf
=[:]){
try
{
try
{
buildHipClangJob
(
conf
)
buildHipClangJob
(
conf
)
...
@@ -169,14 +260,14 @@ def buildHipClangJobAndReboot(Map conf=[:]){
...
@@ -169,14 +260,14 @@ def buildHipClangJobAndReboot(Map conf=[:]){
}
}
}
}
def
runCKProfiler
(
Map
conf
=[:]){
def
runCKProfiler
(
Map
conf
=[:]){
show_node_info
()
show_node_info
()
env
.
HSA_ENABLE_SDMA
=
0
env
.
HSA_ENABLE_SDMA
=
0
checkout
scm
checkout
scm
def
image
=
"composable_kernels"
def
image
=
"composable_kernels_${params.COMPILER_VERSION}"
def
prefixpath
=
conf
.
get
(
"prefixpath"
,
"/opt/rocm"
)
def
prefixpath
=
conf
.
get
(
"prefixpath"
,
"/opt/rocm"
)
def
gpu_arch
=
conf
.
get
(
"gpu_arch"
,
"gfx908"
)
def
gpu_arch
=
conf
.
get
(
"gpu_arch"
,
"gfx908"
)
...
@@ -184,22 +275,29 @@ def runCKProfiler(Map conf=[:]){
...
@@ -184,22 +275,29 @@ def runCKProfiler(Map conf=[:]){
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def
dockerOpts
=
"--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def
dockerOpts
=
"--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1"
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if
(
params
.
COMPILER_VERSION
!=
"release"
){
dockerOpts
=
dockerOpts
+
" --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
def
variant
=
env
.
STAGE_NAME
def
variant
=
env
.
STAGE_NAME
def
retimage
def
retimage
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
if
(
params
.
USE_DOCKERFILE
){
try
{
try
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
'.'
)
//retimage = docker.build("${image}", dockerArgs + '.')
(
retimage
,
image
)
=
getDockerImage
(
conf
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
)
timeout
(
time:
5
,
unit:
'MINUTES'
){
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
if
(
runShell
(
'grep -n "Number of devices:.*. 0" clinfo.log'
)
){
throw
new
Exception
(
"GPU not found"
)
}
else
{
echo
"GPU is OK"
}
}
}
}
}
}
}
...
@@ -208,74 +306,61 @@ def runCKProfiler(Map conf=[:]){
...
@@ -208,74 +306,61 @@ def runCKProfiler(Map conf=[:]){
throw
e
throw
e
}
}
catch
(
Exception
ex
)
{
catch
(
Exception
ex
)
{
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
"--no-cache ."
)
retimage
=
docker
.
build
(
"${image}"
,
dockerArgs
+
"
--no-cache ."
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
timeout
(
time:
5
,
unit:
'MINUTES'
)
timeout
(
time:
5
,
unit:
'MINUTES'
){
{
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
sh
'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
if
(
runShell
(
'grep -n "Number of devices:.*. 0" clinfo.log'
)
){
throw
new
Exception
(
"GPU not found"
)
}
}
else
{
echo
"GPU is OK"
}
}
}
}
}
}
else
{
timeout
(
time:
3
,
unit:
'HOURS'
){
retimage
=
docker
.
image
(
'compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54'
).
pull
()
image
=
"b56f8ac0d6ea"
sh
"docker images"
}
}
}
withDockerContainer
(
image:
image
,
args:
dockerOpts
+
' -v=/var/jenkins/:/var/jenkins'
)
{
withDockerContainer
(
image:
image
,
args:
dockerOpts
+
' -v=/var/jenkins/:/var/jenkins'
)
{
timeout
(
time:
5
,
unit:
'HOURS'
)
timeout
(
time:
24
,
unit:
'HOURS'
)
{
{
cmake_build
(
conf
)
cmake_build
(
conf
)
dir
(
"script"
){
dir
(
"script"
){
//run gemm performance tests
if
(
params
.
RUN_FULL_QA
){
def
gemm_log
=
"perf_gemm_${gpu_arch}.log"
def
qa_log
=
"qa_${gpu_arch}.log"
sh
"rm -f ${gemm_log}"
sh
"./run_full_performance_tests.sh 1 QA_${params.COMPILER_VERSION} ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
sh
"echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}"
archiveArtifacts
"perf_gemm_${gpu_arch}.log"
sh
"echo Node name: ${NODE_NAME} >> ${gemm_log}"
archiveArtifacts
"perf_resnet50_N256_${gpu_arch}.log"
sh
"echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
archiveArtifacts
"perf_resnet50_N4_${gpu_arch}.log"
sh
"rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
archiveArtifacts
"perf_batched_gemm_${gpu_arch}.log"
sh
"hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
archiveArtifacts
"perf_grouped_gemm_${gpu_arch}.log"
sh
"/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
archiveArtifacts
"perf_conv_fwd_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
archiveArtifacts
"perf_conv_bwd_data_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}"
archiveArtifacts
"perf_gemm_bilinear_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}"
archiveArtifacts
"perf_reduction_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}"
// stash perf files to master
sh
"./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}"
stash
name:
"perf_gemm_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}"
stash
name:
"perf_resnet50_N256_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}"
stash
name:
"perf_resnet50_N4_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}"
stash
name:
"perf_batched_gemm_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}"
stash
name:
"perf_grouped_gemm_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}"
stash
name:
"perf_conv_fwd_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}"
stash
name:
"perf_conv_bwd_data_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}"
stash
name:
"perf_gemm_bilinear_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}"
stash
name:
"perf_reduction_${gpu_arch}.log"
sh
"./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}"
//we will process results on the master node
sh
"./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}"
}
sh
"./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}"
else
{
//results will be parsed, stored, and analyzed within the python script
sh
"./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
//the script will return 0 if the performance criteria are met
archiveArtifacts
"perf_gemm_${gpu_arch}.log"
//or return 1 if the criteria are not met
archiveArtifacts
"perf_resnet50_N256_${gpu_arch}.log"
archiveArtifacts
"${gemm_log}"
archiveArtifacts
"perf_resnet50_N4_${gpu_arch}.log"
sh
"python3 parse_perf_data.py ${gemm_log} "
// stash perf files to master
//run resnet50 test
stash
name:
"perf_gemm_${gpu_arch}.log"
def
resnet_log
=
"perf_resnet50_${gpu_arch}.log"
stash
name:
"perf_resnet50_N256_${gpu_arch}.log"
sh
"rm -f ${resnet_log}"
stash
name:
"perf_resnet50_N4_${gpu_arch}.log"
sh
"echo Branch name: ${env.BRANCH_NAME} > ${resnet_log}"
//we will process the results on the master node
sh
"echo Node name: ${NODE_NAME} >> ${resnet_log}"
}
sh
"echo GPU_arch name: ${gpu_arch} >> ${resnet_log}"
sh
"rocminfo | grep 'Compute Unit:' >> ${resnet_log} "
sh
"hipcc --version | grep -e 'HIP version' >> ${resnet_log}"
sh
"/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}"
//first run tests with N=256
sh
"./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log}"
//then run with N=4
sh
"./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log}"
archiveArtifacts
"${resnet_log}"
//the script will put the results from N=256 and N=4 runs into separate tables
sh
"python3 parse_perf_data.py ${resnet_log} "
}
}
}
}
}
}
...
@@ -283,7 +368,6 @@ def runCKProfiler(Map conf=[:]){
...
@@ -283,7 +368,6 @@ def runCKProfiler(Map conf=[:]){
return
retimage
return
retimage
}
}
def
runPerfTest
(
Map
conf
=[:]){
def
runPerfTest
(
Map
conf
=[:]){
try
{
try
{
runCKProfiler
(
conf
)
runCKProfiler
(
conf
)
...
@@ -300,16 +384,97 @@ def runPerfTest(Map conf=[:]){
...
@@ -300,16 +384,97 @@ def runPerfTest(Map conf=[:]){
}
}
}
}
def
process_results
(
Map
conf
=[:]){
env
.
HSA_ENABLE_SDMA
=
0
checkout
scm
def
image
=
"composable_kernels_${params.COMPILER_VERSION}"
def
prefixpath
=
"/opt/rocm"
def
gpu_arch
=
conf
.
get
(
"gpu_arch"
,
"gfx908"
)
// Jenkins is complaining about the render group
def
dockerOpts
=
"--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if
(
conf
.
get
(
"enforce_xnack_on"
,
false
))
{
dockerOpts
=
dockerOpts
+
" --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
def
dockerArgs
=
"--build-arg PREFIX=${prefixpath} --build-arg compiler_version='release' "
def
variant
=
env
.
STAGE_NAME
def
retimage
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCmSoftwarePlatform'
,
repo:
'composable_kernel'
)
{
try
{
//retimage = docker.build("${image}", dockerArgs + '.')
(
retimage
,
image
)
=
getDockerImage
(
conf
)
}
catch
(
org
.
jenkinsci
.
plugins
.
workflow
.
steps
.
FlowInterruptedException
e
){
echo
"The job was cancelled or aborted"
throw
e
}
}
withDockerContainer
(
image:
image
,
args:
dockerOpts
+
' -v=/var/jenkins/:/var/jenkins'
)
{
timeout
(
time:
1
,
unit:
'HOURS'
){
try
{
dir
(
"script"
){
if
(
params
.
RUN_FULL_QA
){
// unstash perf files to master
unstash
"perf_gemm_${gpu_arch}.log"
unstash
"perf_resnet50_N256_${gpu_arch}.log"
unstash
"perf_resnet50_N4_${gpu_arch}.log"
unstash
"perf_batched_gemm_${gpu_arch}.log"
unstash
"perf_grouped_gemm_${gpu_arch}.log"
unstash
"perf_conv_fwd_${gpu_arch}.log"
unstash
"perf_conv_bwd_data_${gpu_arch}.log"
unstash
"perf_gemm_bilinear_${gpu_arch}.log"
unstash
"perf_reduction_${gpu_arch}.log"
sh
"./process_qa_data.sh ${gpu_arch}"
}
else
{
// unstash perf files to master
unstash
"perf_gemm_${gpu_arch}.log"
unstash
"perf_resnet50_N256_${gpu_arch}.log"
unstash
"perf_resnet50_N4_${gpu_arch}.log"
sh
"./process_perf_data.sh ${gpu_arch}"
}
}
}
catch
(
e
){
echo
"throwing error exception while processing performance test results"
echo
'Exception occurred: '
+
e
.
toString
()
throw
e
}
}
}
}
//launch develop branch daily at 23:00 in FULL_QA mode
CRON_SETTINGS
=
BRANCH_NAME
==
"develop"
?
'''0 23 * * * % RUN_FULL_QA=true'''
:
""
pipeline
{
pipeline
{
agent
none
agent
none
triggers
{
parameterizedCron
(
CRON_SETTINGS
)
}
options
{
options
{
parallelsAlwaysFailFast
()
parallelsAlwaysFailFast
()
}
}
parameters
{
parameters
{
booleanParam
(
booleanParam
(
name:
"
USE
_DOCKER
FILE
"
,
name:
"
BUILD
_DOCKER"
,
defaultValue:
true
,
defaultValue:
true
,
description:
""
)
description:
"Force building docker image (default: true)"
)
string
(
name:
'COMPILER_VERSION'
,
defaultValue:
'ck-9110'
,
description:
'Specify which version of compiler to use: ck-9110 (default), release, or amd-stg-open.'
)
booleanParam
(
name:
"RUN_FULL_QA"
,
defaultValue:
false
,
description:
"Select whether to run small set of performance tests (default) or full QA"
)
booleanParam
(
name:
"TEST_NODE_PERFORMANCE"
,
defaultValue:
false
,
description:
"Test the node GPU performance (default: false)"
)
}
}
environment
{
environment
{
dbuser
=
"${dbuser}"
dbuser
=
"${dbuser}"
...
@@ -319,9 +484,28 @@ pipeline {
...
@@ -319,9 +484,28 @@ pipeline {
dbsshuser
=
"${dbsshuser}"
dbsshuser
=
"${dbsshuser}"
dbsshpassword
=
"${dbsshpassword}"
dbsshpassword
=
"${dbsshpassword}"
status_wrapper_creds
=
"${status_wrapper_creds}"
status_wrapper_creds
=
"${status_wrapper_creds}"
gerrit_cred
=
"${gerrit_cred}"
DOCKER_BUILDKIT
=
"1"
}
}
stages
{
stages
{
stage
(
"Build Docker"
){
when
{
expression
{
params
.
BUILD_DOCKER
.
toBoolean
()
}
}
parallel
{
stage
(
'Docker /opt/rocm'
){
agent
{
label
rocmnode
(
"nogpu"
)
}
steps
{
buildDocker
(
'/opt/rocm'
)
}
}
}
}
stage
(
"Static checks"
)
{
stage
(
"Static checks"
)
{
when
{
beforeAgent
true
expression
{
!
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
parallel
{
parallel
{
// enable after we move from hipcc to hip-clang
// enable after we move from hipcc to hip-clang
// stage('Tidy') {
// stage('Tidy') {
...
@@ -355,6 +539,10 @@ pipeline {
...
@@ -355,6 +539,10 @@ pipeline {
}
}
stage
(
"Tests"
)
stage
(
"Tests"
)
{
{
when
{
beforeAgent
true
expression
{
!
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
parallel
parallel
{
{
stage
(
"Run Tests: gfx908"
)
stage
(
"Run Tests: gfx908"
)
...
@@ -369,6 +557,10 @@ pipeline {
...
@@ -369,6 +557,10 @@ pipeline {
}
}
stage
(
"Run Tests: gfx90a"
)
stage
(
"Run Tests: gfx90a"
)
{
{
when
{
beforeAgent
true
expression
{
params
.
RUN_FULL_QA
.
toBoolean
()
}
}
agent
{
label
rocmnode
(
"gfx90a"
)}
agent
{
label
rocmnode
(
"gfx90a"
)}
environment
{
environment
{
setup_args
=
""" -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
setup_args
=
""" -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
...
@@ -381,6 +573,10 @@ pipeline {
...
@@ -381,6 +573,10 @@ pipeline {
}
}
stage
(
"Client App"
)
stage
(
"Client App"
)
{
{
when
{
beforeAgent
true
expression
{
!
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
parallel
parallel
{
{
stage
(
"Run Client App"
)
stage
(
"Run Client App"
)
...
@@ -402,6 +598,10 @@ pipeline {
...
@@ -402,6 +598,10 @@ pipeline {
{
{
stage
(
"Run ckProfiler: gfx908"
)
stage
(
"Run ckProfiler: gfx908"
)
{
{
when
{
beforeAgent
true
expression
{
!
params
.
RUN_FULL_QA
.
toBoolean
()
&&
!
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
agent
{
label
rocmnode
(
"gfx908"
)}
agent
{
label
rocmnode
(
"gfx908"
)}
environment
{
environment
{
setup_args
=
""" -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
setup_args
=
""" -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
...
@@ -412,6 +612,10 @@ pipeline {
...
@@ -412,6 +612,10 @@ pipeline {
}
}
stage
(
"Run ckProfiler: gfx90a"
)
stage
(
"Run ckProfiler: gfx90a"
)
{
{
when
{
beforeAgent
true
expression
{
params
.
RUN_FULL_QA
.
toBoolean
()
||
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
agent
{
label
rocmnode
(
"gfx90a"
)}
agent
{
label
rocmnode
(
"gfx90a"
)}
environment
{
environment
{
setup_args
=
""" -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
setup_args
=
""" -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
...
@@ -422,6 +626,33 @@ pipeline {
...
@@ -422,6 +626,33 @@ pipeline {
}
}
}
}
}
}
stage
(
"Process Performance Test Results"
)
{
parallel
{
stage
(
"Process results for gfx908"
){
when
{
beforeAgent
true
expression
{
!
params
.
RUN_FULL_QA
.
toBoolean
()
&&
!
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
agent
{
label
'mici'
}
steps
{
process_results
(
gpu_arch:
"gfx908"
)
}
}
stage
(
"Process results for gfx90a"
){
when
{
beforeAgent
true
expression
{
params
.
RUN_FULL_QA
.
toBoolean
()
||
params
.
TEST_NODE_PERFORMANCE
.
toBoolean
()
}
}
agent
{
label
'mici'
}
steps
{
process_results
(
gpu_arch:
"gfx90a"
)
}
}
}
}
/* enable after the cmake file supports packaging
/* enable after the cmake file supports packaging
stage("Packages") {
stage("Packages") {
when {
when {
...
...
README.md
View file @
aa5859e4
...
@@ -10,7 +10,7 @@ rocm/tensorflow:rocm5.1-tf2.6-dev \
...
@@ -10,7 +10,7 @@ rocm/tensorflow:rocm5.1-tf2.6-dev \
/bin/bash
/bin/bash
```
```
# Install
the new
rocm-cmake
version
# Install
a newer version of
rocm-cmake
https://github.com/RadeonOpenCompute/rocm-cmake
https://github.com/RadeonOpenCompute/rocm-cmake
## Build
## Build
...
@@ -26,6 +26,7 @@ cmake \
...
@@ -26,6 +26,7 @@ cmake \
-D
CMAKE_CXX_FLAGS
=
" --offload-arch=gfx908 --offload-arch=gfx90a -O3"
\
-D
CMAKE_CXX_FLAGS
=
" --offload-arch=gfx908 --offload-arch=gfx90a -O3"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
-D
CMAKE_INSTALL_PREFIX
=
${
PATH_TO_CK_INSTALL_DIRECTORY
}
\
..
..
```
```
...
@@ -47,6 +48,13 @@ Instructions for running each individual examples are under ```example/```
...
@@ -47,6 +48,13 @@ Instructions for running each individual examples are under ```example/```
```
```
Instructions for running ckProfiler are under
```profiler/```
Instructions for running ckProfiler are under
```profiler/```
## Install CK
```
bash
make
install
```
## Using CK as pre-built kernel library
Instructions for using CK as a pre-built kernel library are under
```client_example/```
## Caveat
## Caveat
### Kernel Timing and Verification
### Kernel Timing and Verification
...
...
client_example/01_gemm/CMakeLists.txt
0 → 100644
View file @
aa5859e4
add_executable
(
client_gemm gemm.cpp
)
target_link_libraries
(
client_gemm PRIVATE composable_kernel::device_operations
)
client_example/01_gemm/gemm.cpp
0 → 100644
View file @
aa5859e4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CElementOp
=
PassThrough
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
CDataType
=
F16
;
using
ALayout
=
Row
;
using
BLayout
=
Col
;
using
CLayout
=
Row
;
/// Minimal owning wrapper around one HIP device allocation.
///
/// The constructor requests the buffer with hipMalloc and the destructor hands
/// the stored pointer back to hipFree. The object is the sole owner of that
/// pointer, so copying is disabled: an implicit copy would duplicate p_mem_ and
/// both destructors would call hipFree on the same address.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Request mem_size bytes of device memory.
    ///
    /// The hipMalloc status is deliberately discarded (best effort, as in the
    /// other client examples). p_mem_ is value-initialized to nullptr before
    /// the call.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        // &p_mem_ already has type void**, so no cast is required.
        (void)hipMalloc(&p_mem_, mem_size);
    }

    // Non-copyable (this also suppresses the implicit move operations).
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Return the raw device pointer; ownership stays with this object.
    void* GetDeviceBuffer() const { return p_mem_; }

    /// Release the allocation; the hipFree status is ignored.
    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_; ///< device pointer written by hipMalloc
};
/// Client example: time every registered GEMM instance for the configured
/// layouts/data types, report the fastest one and run it once more untimed.
///
/// Command line:
///   - no arguments: use the default problem size;
///   - six integers: M N K StrideA StrideB StrideC;
///   - anything else: print the usage line and exit with status 0.
///
/// @return 0 in all cases, including when no instance supports the problem.
int main(int argc, char* argv[])
{
    // GEMM shape
    ck::index_t M = 3840;
    ck::index_t N = 4096;
    ck::index_t K = 4096;

    ck::index_t StrideA = 4096;
    ck::index_t StrideB = 4096;
    ck::index_t StrideC = 4096;

    if(argc == 1)
    {
        // use default case
    }
    else if(argc == 7)
    {
        M = std::stoi(argv[1]);
        N = std::stoi(argv[2]);
        K = std::stoi(argv[3]);

        StrideA = std::stoi(argv[4]);
        StrideB = std::stoi(argv[5]);
        StrideC = std::stoi(argv[6]);
    }
    else
    {
        printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n");
        exit(0);
    }

    // Number of elements spanned by a strided 2D matrix: all leading-dimension
    // steps except the last one, plus one full run along the contiguous dimension.
    auto f_matrix_space_size =
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);

            if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
            {
                return (nRow - 1) * stride + nCol;
            }
            else
            {
                return (nCol - 1) * stride + nRow;
            }
        };

    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
    SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));

    using DeviceOp =
        ck::tensor_operation::device::DeviceGemm<ALayout,
                                                 BLayout,
                                                 CLayout,
                                                 ADataType,
                                                 BDataType,
                                                 CDataType,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 ck::tensor_operation::element_wise::PassThrough>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    const auto a_element_op = AElementOp{};
    const auto b_element_op = BElementOp{};
    const auto c_element_op = CElementOp{};

    std::string best_op_name;
    bool found              = false;
    std::size_t best_op_id  = 0; // only meaningful while `found` is true
    float best_ave_time     = 0;
    float best_tflops       = 0;
    float best_gb_per_sec   = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];

        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
                                                        b_device_buf.GetDeviceBuffer(),
                                                        c_device_buf.GetDeviceBuffer(),
                                                        M,
                                                        N,
                                                        K,
                                                        StrideA,
                                                        StrideB,
                                                        StrideC,
                                                        a_element_op,
                                                        b_element_op,
                                                        c_element_op);

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // ave_time is reported below in milliseconds.
            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop = std::size_t(2) * M * N * K;

            std::size_t num_btype =
                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;

            // flop / 1e9 / ms == TFlop/s, bytes / 1e6 / ms == GB/s
            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

            float gb_per_sec = num_btype / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                      << gb_per_sec << " GB/s, " << op_name << std::endl;

            if(tflops > best_tflops)
            {
                found           = true;
                best_op_id      = i;
                best_op_name    = op_name;
                best_tflops     = tflops;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    // Without a winner there is nothing to report or re-run; indexing op_ptrs
    // with an invalid id would be out of bounds.
    if(!found)
    {
        std::cout << "No instance supports this problem" << std::endl;
        return 0;
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];

        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
                                                        b_device_buf.GetDeviceBuffer(),
                                                        c_device_buf.GetDeviceBuffer(),
                                                        M,
                                                        N,
                                                        K,
                                                        StrideA,
                                                        StrideB,
                                                        StrideC,
                                                        a_element_op,
                                                        b_element_op,
                                                        c_element_op);

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
View file @
aa5859e4
...
@@ -10,7 +10,7 @@
...
@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/
device_
gemm_add_add_fastgelu
_instance
.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
...
@@ -27,7 +27,6 @@ using CDEElementOp = AddAddFastGelu;
...
@@ -27,7 +27,6 @@ using CDEElementOp = AddAddFastGelu;
using
ADataType
=
F16
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
BDataType
=
F16
;
using
AccDataType
=
F32
;
using
D0DataType
=
F16
;
using
D0DataType
=
F16
;
using
D1DataType
=
F16
;
using
D1DataType
=
F16
;
using
EDataType
=
F16
;
using
EDataType
=
F16
;
...
@@ -111,19 +110,22 @@ int main(int argc, char* argv[])
...
@@ -111,19 +110,22 @@ int main(int argc, char* argv[])
f_matrix_space_size
(
M
,
N
,
StrideD1
,
D1Layout
{}));
f_matrix_space_size
(
M
,
N
,
StrideD1
,
D1Layout
{}));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_matrix_space_size
(
M
,
N
,
StrideE
,
ELayout
{}));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_matrix_space_size
(
M
,
N
,
StrideE
,
ELayout
{}));
// add device op instances
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGemmMultipleD
<
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
get_device_gemm_add_add_fastgelu_instances
<
ADataType
,
BDataType
,
AccDataType
,
D0DataType
,
D1DataType
,
EDataType
,
ALayout
,
ALayout
,
BLayout
,
BLayout
,
D0Layout
,
ck
::
Tuple
<
D0Layout
,
D1Layout
>
,
D1Layout
,
ELayout
,
ELayout
>
();
ADataType
,
BDataType
,
ck
::
Tuple
<
D0DataType
,
D1DataType
>
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
AddAddFastGelu
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
...
@@ -231,6 +233,8 @@ int main(int argc, char* argv[])
...
@@ -231,6 +233,8 @@ int main(int argc, char* argv[])
{
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
}
}
return
0
;
return
0
;
...
...
client_example/03_gemm_layernorm/CMakeLists.txt
View file @
aa5859e4
add_executable
(
gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp
)
add_executable
(
client_
gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp
)
target_link_libraries
(
gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_
gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations
)
client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
View file @
aa5859e4
...
@@ -160,8 +160,9 @@ int main()
...
@@ -160,8 +160,9 @@ int main()
ck
::
index_t
StrideC
=
1024
;
ck
::
index_t
StrideC
=
1024
;
ck
::
index_t
StrideD0
=
1024
;
ck
::
index_t
StrideD0
=
1024
;
const
auto
gemm_reduce_ptrs
=
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
const
auto
gemm_reduce_ptrs
=
get_device_gemm_add_add_mean_squaremean_instances
<
ADataType
,
ck
::
tensor_operation
::
device
::
instance
::
get_device_gemm_add_add_mean_squaremean_instances
<
ADataType
,
BDataType
,
BDataType
,
CDataType
,
CDataType
,
ALayout
,
ALayout
,
...
@@ -169,7 +170,7 @@ int main()
...
@@ -169,7 +170,7 @@ int main()
CLayout
>
();
CLayout
>
();
const
auto
normalize_ptrs
=
const
auto
normalize_ptrs
=
ck
::
tensor_operation
::
device
::
get_device_normalize_from_mean_meansquare_instances
<
ck
::
tensor_operation
::
device
::
instance
::
get_device_normalize_from_mean_meansquare_instances
<
CDataType
,
CDataType
,
ReduceDataType
,
ReduceDataType
,
ReduceDataType
,
ReduceDataType
,
...
...
client_example/04_contraction/CMakeLists.txt
0 → 100644
View file @
aa5859e4
add_executable
(
client_contraction_scale contraction_scale.cpp
)
target_link_libraries
(
client_contraction_scale PRIVATE composable_kernel::device_operations
)
add_executable
(
client_contraction_bilinear contraction_bilinear.cpp
)
target_link_libraries
(
client_contraction_bilinear PRIVATE composable_kernel::device_operations
)
client_example/04_contraction/contraction_bilinear.cpp
0 → 100644
View file @
aa5859e4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Bilinear
=
ck
::
tensor_operation
::
element_wise
::
Bilinear
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
Bilinear
;
using
ADataType
=
F32
;
using
BDataType
=
F32
;
using
AccDataType
=
F32
;
using
CShuffleDataType
=
F32
;
using
DDataType
=
F32
;
using
DsDataType
=
ck
::
Tuple
<
DDataType
>
;
using
EDataType
=
F32
;
static
constexpr
ck
::
index_t
NumDimM
=
2
;
static
constexpr
ck
::
index_t
NumDimN
=
2
;
static
constexpr
ck
::
index_t
NumDimK
=
2
;
/// Minimal owning wrapper around one HIP device allocation.
///
/// The constructor requests the buffer with hipMalloc and the destructor hands
/// the stored pointer back to hipFree. The object is the sole owner of that
/// pointer, so copying is disabled: an implicit copy would duplicate p_mem_ and
/// both destructors would call hipFree on the same address.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Request mem_size bytes of device memory.
    ///
    /// The hipMalloc status is deliberately discarded (best effort, as in the
    /// other client examples). p_mem_ is value-initialized to nullptr before
    /// the call.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        // &p_mem_ already has type void**, so no cast is required.
        (void)hipMalloc(&p_mem_, mem_size);
    }

    // Non-copyable (this also suppresses the implicit move operations).
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Return the raw device pointer; ownership stays with this object.
    void* GetDeviceBuffer() const { return p_mem_; }

    /// Release the allocation; the hipFree status is ignored.
    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_; ///< device pointer written by hipMalloc
};
int
main
(
int
argc
,
char
*
argv
[])
{
// A[M0, M1, K0, K1]
std
::
vector
<
ck
::
index_t
>
a_ms_ks_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
a_ms_ks_strides
{
524288
,
4096
,
128
,
1
};
// B[N0, N1, K0, K1]
std
::
vector
<
ck
::
index_t
>
b_ns_ks_lengths
{
32
,
64
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
b_ns_ks_strides
{
524288
,
4096
,
128
,
1
};
// D[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
d_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
d_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
// E[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
e_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
e_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
float
alpha
=
1.
f
;
float
beta
=
1.
f
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
25
)
{
const
ck
::
index_t
M0
=
std
::
stoi
(
argv
[
1
]);
const
ck
::
index_t
M1
=
std
::
stoi
(
argv
[
2
]);
const
ck
::
index_t
N0
=
std
::
stoi
(
argv
[
3
]);
const
ck
::
index_t
N1
=
std
::
stoi
(
argv
[
4
]);
const
ck
::
index_t
K0
=
std
::
stoi
(
argv
[
5
]);
const
ck
::
index_t
K1
=
std
::
stoi
(
argv
[
6
]);
a_ms_ks_lengths
=
{
M0
,
M1
,
K0
,
K1
};
a_ms_ks_strides
=
{
std
::
stoi
(
argv
[
7
]),
std
::
stoi
(
argv
[
8
]),
std
::
stoi
(
argv
[
9
]),
std
::
stoi
(
argv
[
10
])};
b_ns_ks_lengths
=
{
N0
,
N1
,
K0
,
K1
};
b_ns_ks_strides
=
{
std
::
stoi
(
argv
[
11
]),
std
::
stoi
(
argv
[
12
]),
std
::
stoi
(
argv
[
13
]),
std
::
stoi
(
argv
[
14
])};
d_ms_ns_lengths
=
{
M0
,
M1
,
N0
,
N1
};
d_ms_ns_strides
=
{
std
::
stoi
(
argv
[
15
]),
std
::
stoi
(
argv
[
16
]),
std
::
stoi
(
argv
[
17
]),
std
::
stoi
(
argv
[
18
])};
e_ms_ns_lengths
=
{
M0
,
M1
,
N0
,
N1
};
e_ms_ns_strides
=
{
std
::
stoi
(
argv
[
19
]),
std
::
stoi
(
argv
[
20
]),
std
::
stoi
(
argv
[
21
]),
std
::
stoi
(
argv
[
22
])};
alpha
=
std
::
stof
(
argv
[
23
]);
beta
=
std
::
stof
(
argv
[
24
]);
}
else
{
printf
(
"arg1 to 6: M0, M1, N0, N1, K0, K1
\n
"
);
printf
(
"arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1
\n
"
);
printf
(
"arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1
\n
"
);
printf
(
"arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1
\n
"
);
printf
(
"arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1
\n
"
);
printf
(
"arg23 to 24: alpha, beta
\n
"
);
exit
(
0
);
}
auto
f_tensor_space_size
=
[](
auto
lengths
,
auto
strides
)
{
std
::
size_t
space_size
=
1
;
for
(
std
::
size_t
i
=
0
;
i
<
lengths
.
size
();
++
i
)
{
space_size
+=
(
lengths
[
i
]
-
1
)
*
strides
[
i
];
}
return
space_size
;
};
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
f_tensor_space_size
(
a_ms_ks_lengths
,
a_ms_ks_strides
));
SimpleDeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
f_tensor_space_size
(
b_ns_ks_lengths
,
b_ns_ks_strides
));
SimpleDeviceMem
d_device_buf
(
sizeof
(
DDataType
)
*
f_tensor_space_size
(
d_ms_ns_lengths
,
d_ms_ns_strides
));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_tensor_space_size
(
e_ms_ns_lengths
,
e_ms_ns_strides
));
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceContractionMultipleD
<
NumDimM
,
NumDimN
,
NumDimK
,
ADataType
,
BDataType
,
ck
::
Tuple
<
DDataType
>
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
Bilinear
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{
alpha
,
beta
};
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
d_device_buf
.
GetDeviceBuffer
()},
e_device_buf
.
GetDeviceBuffer
(),
a_ms_ks_lengths
,
a_ms_ks_strides
,
b_ns_ks_lengths
,
b_ns_ks_strides
,
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
d_ms_ns_lengths
},
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
d_ms_ns_strides
},
e_ms_ns_lengths
,
e_ms_ns_strides
,
a_element_op
,
b_element_op
,
cde_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
ck
::
index_t
M
=
std
::
accumulate
(
e_ms_ns_lengths
.
begin
(),
e_ms_ns_lengths
.
begin
()
+
NumDimM
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
ck
::
index_t
N
=
std
::
accumulate
(
e_ms_ns_lengths
.
begin
()
+
NumDimM
,
e_ms_ns_lengths
.
begin
()
+
NumDimM
+
NumDimN
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
ck
::
index_t
K
=
std
::
accumulate
(
a_ms_ks_lengths
.
begin
()
+
NumDimM
,
a_ms_ks_lengths
.
begin
()
+
NumDimM
+
NumDimK
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
DDataType
)
*
M
*
N
+
sizeof
(
EDataType
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
return
0
;
}
client_example/04_contraction/contraction_scale.cpp
0 → 100644
View file @
aa5859e4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <numeric>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Scale
=
ck
::
tensor_operation
::
element_wise
::
Scale
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
Scale
;
using
ADataType
=
F32
;
using
BDataType
=
F32
;
using
AccDataType
=
F32
;
using
CShuffleDataType
=
F32
;
using
DsDataType
=
ck
::
Tuple
<>
;
using
EDataType
=
F32
;
static
constexpr
ck
::
index_t
NumDimM
=
2
;
static
constexpr
ck
::
index_t
NumDimN
=
2
;
static
constexpr
ck
::
index_t
NumDimK
=
2
;
/// Minimal owning wrapper around one HIP device allocation.
///
/// The constructor requests the buffer with hipMalloc and the destructor hands
/// the stored pointer back to hipFree. The object is the sole owner of that
/// pointer, so copying is disabled: an implicit copy would duplicate p_mem_ and
/// both destructors would call hipFree on the same address.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Request mem_size bytes of device memory.
    ///
    /// The hipMalloc status is deliberately discarded (best effort, as in the
    /// other client examples). p_mem_ is value-initialized to nullptr before
    /// the call.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        // &p_mem_ already has type void**, so no cast is required.
        (void)hipMalloc(&p_mem_, mem_size);
    }

    // Non-copyable (this also suppresses the implicit move operations).
    SimpleDeviceMem(const SimpleDeviceMem&) = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Return the raw device pointer; ownership stays with this object.
    void* GetDeviceBuffer() const { return p_mem_; }

    /// Release the allocation; the hipFree status is ignored.
    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_; ///< device pointer written by hipMalloc
};
int
main
(
int
argc
,
char
*
argv
[])
{
// A[M0, M1, K0, K1]
std
::
vector
<
ck
::
index_t
>
a_ms_ks_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
a_ms_ks_strides
{
524288
,
4096
,
128
,
1
};
// B[N0, N1, K0, K1]
std
::
vector
<
ck
::
index_t
>
b_ns_ks_lengths
{
32
,
64
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
b_ns_ks_strides
{
524288
,
4096
,
128
,
1
};
// E[M0, M1, N0, N1]
std
::
vector
<
ck
::
index_t
>
e_ms_ns_lengths
{
30
,
128
,
32
,
64
};
std
::
vector
<
ck
::
index_t
>
e_ms_ns_strides
{
524288
,
4096
,
128
,
1
};
float
scale
=
1.
f
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
20
)
{
const
ck
::
index_t
M0
=
std
::
stoi
(
argv
[
1
]);
const
ck
::
index_t
M1
=
std
::
stoi
(
argv
[
2
]);
const
ck
::
index_t
N0
=
std
::
stoi
(
argv
[
3
]);
const
ck
::
index_t
N1
=
std
::
stoi
(
argv
[
4
]);
const
ck
::
index_t
K0
=
std
::
stoi
(
argv
[
5
]);
const
ck
::
index_t
K1
=
std
::
stoi
(
argv
[
6
]);
a_ms_ks_lengths
=
{
M0
,
M1
,
K0
,
K1
};
a_ms_ks_strides
=
{
std
::
stoi
(
argv
[
7
]),
std
::
stoi
(
argv
[
8
]),
std
::
stoi
(
argv
[
9
]),
std
::
stoi
(
argv
[
10
])};
b_ns_ks_lengths
=
{
N0
,
N1
,
K0
,
K1
};
b_ns_ks_strides
=
{
std
::
stoi
(
argv
[
11
]),
std
::
stoi
(
argv
[
12
]),
std
::
stoi
(
argv
[
13
]),
std
::
stoi
(
argv
[
14
])};
e_ms_ns_lengths
=
{
M0
,
M1
,
N0
,
N1
};
e_ms_ns_strides
=
{
std
::
stoi
(
argv
[
15
]),
std
::
stoi
(
argv
[
16
]),
std
::
stoi
(
argv
[
17
]),
std
::
stoi
(
argv
[
18
])};
scale
=
std
::
stof
(
argv
[
19
]);
}
else
{
printf
(
"arg1 to 6: M0, M1, N0, N1, K0, K1
\n
"
);
printf
(
"arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1
\n
"
);
printf
(
"arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1
\n
"
);
printf
(
"arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1
\n
"
);
printf
(
"arg19: scale
\n
"
);
exit
(
0
);
}
auto
f_tensor_space_size
=
[](
auto
lengths
,
auto
strides
)
{
std
::
size_t
space_size
=
1
;
for
(
std
::
size_t
i
=
0
;
i
<
lengths
.
size
();
++
i
)
{
space_size
+=
(
lengths
[
i
]
-
1
)
*
strides
[
i
];
}
return
space_size
;
};
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
f_tensor_space_size
(
a_ms_ks_lengths
,
a_ms_ks_strides
));
SimpleDeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
f_tensor_space_size
(
b_ns_ks_lengths
,
b_ns_ks_strides
));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_tensor_space_size
(
e_ms_ns_lengths
,
e_ms_ns_strides
));
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceContractionMultipleD
<
NumDimM
,
NumDimN
,
NumDimK
,
ADataType
,
BDataType
,
ck
::
Tuple
<>
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
Scale
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{
scale
};
std
::
string
best_op_name
;
bool
found
=
false
;
int
best_op_id
=
-
1
;
float
best_ave_time
=
0
;
float
best_tflops
=
0
;
float
best_gb_per_sec
=
0
;
// profile device operation instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
0
>
{},
e_device_buf
.
GetDeviceBuffer
(),
a_ms_ks_lengths
,
a_ms_ks_strides
,
b_ns_ks_lengths
,
b_ns_ks_strides
,
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
0
>
{},
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
0
>
{},
e_ms_ns_lengths
,
e_ms_ns_strides
,
a_element_op
,
b_element_op
,
cde_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
true
});
ck
::
index_t
M
=
std
::
accumulate
(
e_ms_ns_lengths
.
begin
(),
e_ms_ns_lengths
.
begin
()
+
NumDimM
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
ck
::
index_t
N
=
std
::
accumulate
(
e_ms_ns_lengths
.
begin
()
+
NumDimM
,
e_ms_ns_lengths
.
begin
()
+
NumDimM
+
NumDimN
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
ck
::
index_t
K
=
std
::
accumulate
(
a_ms_ks_lengths
.
begin
()
+
NumDimM
,
a_ms_ks_lengths
.
begin
()
+
NumDimM
+
NumDimK
,
ck
::
index_t
{
1
},
std
::
multiplies
<
ck
::
index_t
>
{});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
EDataType
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op_name
<<
std
::
endl
;
if
(
tflops
>
best_tflops
)
{
found
=
true
;
best_op_id
=
i
;
best_op_name
=
op_name
;
best_tflops
=
tflops
;
best_ave_time
=
ave_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
else
{
std
::
cout
<<
op_name
<<
" does not support this problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_op_name
<<
std
::
endl
;
return
0
;
}
client_example/CMakeLists.txt
View file @
aa5859e4
...
@@ -6,5 +6,7 @@ find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
...
@@ -6,5 +6,7 @@ find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
find_package
(
hip REQUIRED PATHS /opt/rocm
)
find_package
(
hip REQUIRED PATHS /opt/rocm
)
message
(
STATUS
"Build with HIP
${
hip_VERSION
}
"
)
message
(
STATUS
"Build with HIP
${
hip_VERSION
}
"
)
add_subdirectory
(
01_gemm
)
add_subdirectory
(
02_gemm_add_add_fastgelu
)
add_subdirectory
(
02_gemm_add_add_fastgelu
)
add_subdirectory
(
03_gemm_layernorm
)
add_subdirectory
(
03_gemm_layernorm
)
add_subdirectory
(
04_contraction
)
client_example/README.md
View file @
aa5859e4
##
##
Client applications link to the CK library, so the CK library must be installed before building them.
Client applications link to the CK library, so the CK library must be installed before building them.
## Docker script
```
bash
docker run
\
-it
\
--privileged
\
--group-add
sudo
\
-w
/root/workspace
\
-v
${
PATH_TO_LOCAL_WORKSPACE
}
:/root/workspace
\
rocm/tensorflow:rocm5.1-tf2.6-dev
\
/bin/bash
```
## Build
## Build
```
bash
```
bash
...
@@ -22,7 +11,7 @@ cd client_example/build
...
@@ -22,7 +11,7 @@ cd client_example/build
```
bash
```
bash
cmake
\
cmake
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
-D
CMAKE_PREFIX_PATH
=
"
/opt/rocm
;
${
PATH_TO_CK_INSTALL_DIRECTORY
}
"
\
..
..
```
```
...
...
cmake/googletest.cmake
View file @
aa5859e4
...
@@ -20,6 +20,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
...
@@ -20,6 +20,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-unused-member-function
-Wno-unused-member-function
-Wno-comma
-Wno-comma
-Wno-old-style-cast
-Wno-old-style-cast
-Wno-deprecated
)
)
message
(
STATUS
"Suppressing googltest warnings with flags:
${
GTEST_CMAKE_CXX_FLAGS
}
"
)
message
(
STATUS
"Suppressing googltest warnings with flags:
${
GTEST_CMAKE_CXX_FLAGS
}
"
)
...
...
example/01_gemm/CMakeLists.txt
View file @
aa5859e4
...
@@ -4,5 +4,6 @@ add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
...
@@ -4,5 +4,6 @@ add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
add_example_executable
(
example_gemm_xdl_fp16 gemm_xdl_fp16.cpp
)
add_example_executable
(
example_gemm_xdl_fp16 gemm_xdl_fp16.cpp
)
add_example_executable
(
example_gemm_xdl_bf16 gemm_xdl_bf16.cpp
)
add_example_executable
(
example_gemm_xdl_bf16 gemm_xdl_bf16.cpp
)
add_example_executable
(
example_gemm_xdl_int8 gemm_xdl_int8.cpp
)
add_example_executable
(
example_gemm_xdl_int8 gemm_xdl_int8.cpp
)
add_example_executable
(
example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp
)
# FIXME: re-enable this example as test when SWDEV-335738 is fixed
# FIXME: re-enable this example as test when SWDEV-335738 is fixed
add_example_executable_no_testing
(
example_gemm_xdl_fp64 gemm_xdl_fp64.cpp
)
add_example_executable_no_testing
(
example_gemm_xdl_fp64 gemm_xdl_fp64.cpp
)
example/01_gemm/gemm_dl_fp16.cpp
View file @
aa5859e4
...
@@ -12,9 +12,9 @@
...
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/
host_tensor
/device_memory.hpp"
#include "ck/library/
utility
/device_memory.hpp"
#include "ck/library/
host_tensor
/host_tensor.hpp"
#include "ck/library/
utility
/host_tensor.hpp"
#include "ck/library/
host_tensor
/host_tensor_generator.hpp"
#include "ck/library/
utility
/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
...
@@ -142,9 +142,9 @@ int main(int argc, char* argv[])
...
@@ -142,9 +142,9 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_Sequential
<
1
>
{});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_Sequential
<
1
>
{});
}
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
...
...
example/01_gemm/gemm_dl_fp32.cpp
View file @
aa5859e4
...
@@ -12,9 +12,9 @@
...
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/
host_tensor
/device_memory.hpp"
#include "ck/library/
utility
/device_memory.hpp"
#include "ck/library/
host_tensor
/host_tensor.hpp"
#include "ck/library/
utility
/host_tensor.hpp"
#include "ck/library/
host_tensor
/host_tensor_generator.hpp"
#include "ck/library/
utility
/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
...
@@ -141,9 +141,9 @@ int main(int argc, char* argv[])
...
@@ -141,9 +141,9 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_Sequential
<
1
>
{});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_Sequential
<
1
>
{});
}
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
...
...
example/01_gemm/gemm_dl_int8.cpp
View file @
aa5859e4
...
@@ -12,9 +12,9 @@
...
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/
host_tensor
/device_memory.hpp"
#include "ck/library/
utility
/device_memory.hpp"
#include "ck/library/
host_tensor
/host_tensor.hpp"
#include "ck/library/
utility
/host_tensor.hpp"
#include "ck/library/
host_tensor
/host_tensor_generator.hpp"
#include "ck/library/
utility
/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
...
@@ -139,9 +139,9 @@ int main(int argc, char* argv[])
...
@@ -139,9 +139,9 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_Sequential
<
1
>
{});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_Sequential
<
1
>
{});
}
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
...
...
example/01_gemm/gemm_xdl_bf16.cpp
View file @
aa5859e4
...
@@ -11,9 +11,9 @@
...
@@ -11,9 +11,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/library/
host_tensor
/device_memory.hpp"
#include "ck/library/
utility
/device_memory.hpp"
#include "ck/library/
host_tensor
/host_tensor.hpp"
#include "ck/library/
utility
/host_tensor.hpp"
#include "ck/library/
host_tensor
/host_tensor_generator.hpp"
#include "ck/library/
utility
/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
...
@@ -170,9 +170,9 @@ int main(int argc, char* argv[])
...
@@ -170,9 +170,9 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
}
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
());
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
...
...
Prev
1
2
3
4
5
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment