Commit aea62819 authored by Chaitanya Inumella

Rebase branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into contraction_hipTENSOR
parents 75af5450 75ab874e
@@ -2,6 +2,7 @@ FROM ubuntu:18.04
 ARG ROCMVERSION=5.1
 ARG OSDB_BKC_VERSION
+ARG compiler_version
 RUN set -xe
@@ -23,8 +24,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     cmake-data=3.15.1-0kitware1 \
     cmake=3.15.1-0kitware1 \
     curl \
-    g++ \
-    gdb \
     git \
     hip-rocclr \
     jq \
@@ -61,17 +60,7 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
 RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
 RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb
-# Install cget
-RUN pip install cget
-# Install rclone
-RUN pip install https://github.com/pfultz2/rclone/archive/master.tar.gz
 ARG PREFIX=/opt/rocm
-# Install dependencies
-RUN cget install pfultz2/rocm-recipes
-# Install rbuild
-RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/6d78a0553babdaea8d2da5de15cbda7e869594b8.tar.gz
 # Install packages for processing the performance results
 RUN pip3 install --upgrade pip
 RUN pip3 install sqlalchemy
@@ -84,12 +73,26 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
-ADD rbuild.ini /rbuild.ini
 ADD dev-requirements.txt dev-requirements.txt
-RUN rbuild prepare -s develop -d $PREFIX
 RUN groupadd -f render
 # Install the new rocm-cmake version
 RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git && \
     cd rocm-cmake && mkdir build && cd build && \
     cmake .. && cmake --build . && cmake --build . --target install
+WORKDIR /
+ENV compiler_version=$compiler_version
+RUN sh -c "echo compiler version = '$compiler_version'"
+RUN if [ "$compiler_version" = "9110" ]; then \
+        git clone -b ck-9110 https://github.com/RadeonOpenCompute/llvm-project.git && \
+        cd llvm-project && mkdir build && cd build && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
+        make -j 8 ; \
+    else echo "using the release compiler"; \
+    fi
+#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
+#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
@@ -11,6 +11,13 @@ def show_node_info() {
     """
 }
+def runShell(String command){
+    def responseCode = sh returnStatus: true, script: "${command} > tmp.txt"
+    def output = readFile(file: "tmp.txt")
+    echo "tmp.txt contents: $output"
+    return (output != "")
+}
 def cmake_build(Map conf=[:]){
     def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
@@ -60,7 +67,7 @@ def cmake_build(Map conf=[:]){
     """
     def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
     // reduce parallelism when compiling, clang uses too much memory
-    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 1 )) ${config_targets}")
+    def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}")
    def execute_cmd = conf.get("execute_cmd", "")
    def cmd = conf.get("cmd", """
@@ -95,49 +102,56 @@ def buildHipClangJob(Map conf=[:]){
     if (conf.get("enforce_xnack_on", false)) {
         dockerOpts = dockerOpts + " --env HSA_XNACK=1"
     }
-    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
+    def dockerArgs
+    if (params.USE_9110){
+        dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='9110' "
+        dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+    }
+    else{
+        dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
+    }
     def variant = env.STAGE_NAME
     def retimage
     gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
-        if (params.USE_DOCKERFILE){
-            try {
-                retimage = docker.build("${image}", dockerArgs + '.')
-                withDockerContainer(image: image, args: dockerOpts) {
-                    timeout(time: 5, unit: 'MINUTES')
-                    {
-                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
-                    }
-                }
-            }
-            catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
-                echo "The job was cancelled or aborted"
-                throw e
-            }
-            catch(Exception ex) {
-                retimage = docker.build("${image}", dockerArgs + "--no-cache .")
-                withDockerContainer(image: image, args: dockerOpts) {
-                    timeout(time: 5, unit: 'MINUTES')
-                    {
-                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
-                    }
-                }
-            }
-        }
-        else{
-            timeout(time: 3, unit: 'HOURS'){
-                retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
-                image="b56f8ac0d6ea"
-                sh "docker images"
-            }
-        }
+        try {
+            retimage = docker.build("${image}", dockerArgs + '.')
+            withDockerContainer(image: image, args: dockerOpts) {
+                timeout(time: 5, unit: 'MINUTES'){
+                    sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+                    if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                        throw new Exception ("GPU not found")
+                    }
+                    else{
+                        echo "GPU is OK"
+                    }
+                }
+            }
+        }
+        catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+            echo "The job was cancelled or aborted"
+            throw e
+        }
+        catch(Exception ex) {
+            retimage = docker.build("${image}", dockerArgs + " --no-cache .")
+            withDockerContainer(image: image, args: dockerOpts) {
+                timeout(time: 5, unit: 'MINUTES'){
+                    sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log'
+                    if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                        throw new Exception ("GPU not found")
+                    }
+                    else{
+                        echo "GPU is OK"
+                    }
+                }
+            }
+        }
         withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
             timeout(time: 5, unit: 'HOURS')
             {
-                sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
                 cmake_build(conf)
             }
         }
@@ -149,10 +163,6 @@ def reboot(){
     build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
 }
 def buildHipClangJobAndReboot(Map conf=[:]){
     try{
         buildHipClangJob(conf)
@@ -169,7 +179,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){
     }
 }
 def runCKProfiler(Map conf=[:]){
     show_node_info()
@@ -186,96 +195,103 @@ def runCKProfiler(Map conf=[:]){
     if (conf.get("enforce_xnack_on", false)) {
         dockerOpts = dockerOpts + " --env HSA_XNACK=1"
     }
-    def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
+    def dockerArgs
+    if (params.USE_9110){
+        dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='9110' "
+        dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+    }
+    else{
+        dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
+    }
     def variant = env.STAGE_NAME
     def retimage
     gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
-        if (params.USE_DOCKERFILE){
-            try {
-                retimage = docker.build("${image}", dockerArgs + '.')
-                withDockerContainer(image: image, args: dockerOpts) {
-                    timeout(time: 5, unit: 'MINUTES')
-                    {
-                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
-                    }
-                }
-            }
-            catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
-                echo "The job was cancelled or aborted"
-                throw e
-            }
-            catch(Exception ex) {
-                retimage = docker.build("${image}", dockerArgs + "--no-cache .")
-                withDockerContainer(image: image, args: dockerOpts) {
-                    timeout(time: 5, unit: 'MINUTES')
-                    {
-                        sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
-                    }
-                }
-            }
-        }
-        else{
-            timeout(time: 3, unit: 'HOURS'){
-                retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
-                image="b56f8ac0d6ea"
-                sh "docker images"
-            }
-        }
+        try {
+            retimage = docker.build("${image}", dockerArgs + '.')
+            withDockerContainer(image: image, args: dockerOpts) {
+                timeout(time: 5, unit: 'MINUTES'){
+                    sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+                    if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                        throw new Exception ("GPU not found")
+                    }
+                    else{
+                        echo "GPU is OK"
+                    }
+                }
+            }
+        }
+        catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+            echo "The job was cancelled or aborted"
+            throw e
+        }
+        catch(Exception ex) {
+            retimage = docker.build("${image}", dockerArgs + " --no-cache .")
+            withDockerContainer(image: image, args: dockerOpts) {
+                timeout(time: 5, unit: 'MINUTES'){
+                    sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+                    if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                        throw new Exception ("GPU not found")
+                    }
+                    else{
+                        echo "GPU is OK"
+                    }
+                }
+            }
+        }
         withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
-            timeout(time: 5, unit: 'HOURS')
+            timeout(time: 24, unit: 'HOURS')
             {
                 cmake_build(conf)
                 dir("script"){
-                    //run gemm performance tests
-                    def gemm_log = "perf_gemm_${gpu_arch}.log"
-                    sh "rm -f ${gemm_log}"
-                    sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}"
-                    sh "echo Node name: ${NODE_NAME} >> ${gemm_log}"
-                    sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
-                    sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
-                    sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
-                    sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}"
-                    sh "./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}"
-                    //results will be parsed, stored, and analyzed within the python script
-                    //the script will return 0 if the performance criteria are met
-                    //or return 1 if the criteria are not met
-                    archiveArtifacts "${gemm_log}"
-                    sh "python3 parse_perf_data.py ${gemm_log} "
-                    //run resnet50 test
-                    def resnet_log = "perf_resnet50_${gpu_arch}.log"
-                    sh "rm -f ${resnet_log}"
-                    sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet_log}"
-                    sh "echo Node name: ${NODE_NAME} >> ${resnet_log}"
-                    sh "echo GPU_arch name: ${gpu_arch} >> ${resnet_log}"
-                    sh "rocminfo | grep 'Compute Unit:' >> ${resnet_log} "
-                    sh "hipcc --version | grep -e 'HIP version' >> ${resnet_log}"
-                    sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}"
-                    //first run tests with N=256
-                    sh "./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log}"
-                    //then run with N=4
-                    sh "./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log}"
-                    archiveArtifacts "${resnet_log}"
-                    //the script will put the results from N=256 and N=4 runs into separate tables
-                    sh "python3 parse_perf_data.py ${resnet_log} "
+                    if (params.RUN_FULL_QA){
+                        def qa_log = "qa_${gpu_arch}.log"
+                        if (params.USE_9110){
+                            sh "./run_full_performance_tests.sh 1 QA_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                        }
+                        else{
+                            sh "./run_full_performance_tests.sh 1 QA_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                        }
+                        archiveArtifacts "perf_gemm_${gpu_arch}.log"
+                        archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
+                        archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
+                        archiveArtifacts "perf_batched_gemm_${gpu_arch}.log"
+                        archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
+                        archiveArtifacts "perf_fwd_conv_${gpu_arch}.log"
+                        archiveArtifacts "perf_bwd_conv_${gpu_arch}.log"
+                        archiveArtifacts "perf_fusion_${gpu_arch}.log"
+                        archiveArtifacts "perf_reduction_${gpu_arch}.log"
+                        // stash perf files to master
+                        stash name: "perf_gemm_${gpu_arch}.log"
+                        stash name: "perf_resnet50_N256_${gpu_arch}.log"
+                        stash name: "perf_resnet50_N4_${gpu_arch}.log"
+                        stash name: "perf_batched_gemm_${gpu_arch}.log"
+                        stash name: "perf_grouped_gemm_${gpu_arch}.log"
+                        stash name: "perf_fwd_conv_${gpu_arch}.log"
+                        stash name: "perf_bwd_conv_${gpu_arch}.log"
+                        stash name: "perf_fusion_${gpu_arch}.log"
+                        stash name: "perf_reduction_${gpu_arch}.log"
+                        //we will process results on the master node
+                    }
+                    else{
+                        if (params.USE_9110){
+                            sh "./run_performance_tests.sh 0 CI_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                        }
+                        else{
+                            sh "./run_performance_tests.sh 0 CI_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+                        }
+                        archiveArtifacts "perf_gemm_${gpu_arch}.log"
+                        archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
+                        archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
+                        // stash perf files to master
+                        stash name: "perf_gemm_${gpu_arch}.log"
+                        stash name: "perf_resnet50_N256_${gpu_arch}.log"
+                        stash name: "perf_resnet50_N4_${gpu_arch}.log"
+                        //we will process the results on the master node
+                    }
                 }
             }
         }
@@ -283,7 +299,6 @@ def runCKProfiler(Map conf=[:]){
     return retimage
 }
 def runPerfTest(Map conf=[:]){
     try{
         runCKProfiler(conf)
@@ -300,16 +315,88 @@ def runPerfTest(Map conf=[:]){
     }
 }
def process_results(Map conf=[:]){
env.HSA_ENABLE_SDMA=0
checkout scm
def image = "composable_kernels"
def prefixpath = "/opt/rocm"
def gpu_arch = conf.get("gpu_arch", "gfx908")
// Jenkins is complaining about the render group
def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1"
}
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try {
retimage = docker.build("${image}", dockerArgs + '.')
}
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
echo "The job was cancelled or aborted"
throw e
}
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 1, unit: 'HOURS'){
try{
dir("script"){
if (params.RUN_FULL_QA){
// unstash perf files to master
unstash "perf_gemm_${gpu_arch}.log"
unstash "perf_resnet50_N256_${gpu_arch}.log"
unstash "perf_resnet50_N4_${gpu_arch}.log"
unstash "perf_batched_gemm_${gpu_arch}.log"
unstash "perf_grouped_gemm_${gpu_arch}.log"
unstash "perf_fwd_conv_${gpu_arch}.log"
unstash "perf_bwd_conv_${gpu_arch}.log"
unstash "perf_fusion_${gpu_arch}.log"
unstash "perf_reduction_${gpu_arch}.log"
sh "./process_qa_data.sh ${gpu_arch}"
}
else{
// unstash perf files to master
unstash "perf_gemm_${gpu_arch}.log"
unstash "perf_resnet50_N256_${gpu_arch}.log"
unstash "perf_resnet50_N4_${gpu_arch}.log"
sh "./process_perf_data.sh ${gpu_arch}"
}
}
}
catch(e){
echo "throwing error exception while processing performance test results"
echo 'Exception occurred: ' + e.toString()
throw e
}
}
}
}
//launch develop branch daily at 23:00 in FULL_QA mode
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;USE_9110=true''' : ""
 pipeline {
     agent none
+    triggers {
+        parameterizedCron(CRON_SETTINGS)
+    }
     options {
         parallelsAlwaysFailFast()
     }
     parameters {
         booleanParam(
-            name: "USE_DOCKERFILE",
+            name: "USE_9110",
             defaultValue: true,
-            description: "")
+            description: "Select compiler version: 9110 (default) or release")
+        booleanParam(
+            name: "RUN_FULL_QA",
+            defaultValue: false,
+            description: "Select whether to run small set of performance tests (default) or full QA")
     }
     environment{
         dbuser = "${dbuser}"
@@ -369,6 +456,10 @@ pipeline {
     }
stage("Run Tests: gfx90a") stage("Run Tests: gfx90a")
{ {
when {
beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() }
}
agent{ label rocmnode("gfx90a")} agent{ label rocmnode("gfx90a")}
environment{ environment{
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """ setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
...@@ -402,6 +493,10 @@ pipeline { ...@@ -402,6 +493,10 @@ pipeline {
         {
             stage("Run ckProfiler: gfx908")
             {
+                when {
+                    beforeAgent true
+                    expression { !params.RUN_FULL_QA.toBoolean() }
+                }
                 agent{ label rocmnode("gfx908")}
                 environment{
                     setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
@@ -412,6 +507,10 @@ pipeline {
             }
stage("Run ckProfiler: gfx90a") stage("Run ckProfiler: gfx90a")
{ {
when {
beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() }
}
agent{ label rocmnode("gfx90a")} agent{ label rocmnode("gfx90a")}
environment{ environment{
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """ setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
...@@ -422,6 +521,33 @@ pipeline { ...@@ -422,6 +521,33 @@ pipeline {
} }
} }
} }
stage("Process Performance Test Results")
{
parallel
{
stage("Process results for gfx908"){
when {
beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() }
}
agent { label 'mici' }
steps{
process_results(gpu_arch: "gfx908")
}
}
stage("Process results for gfx90a"){
when {
beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() }
}
agent { label 'mici' }
steps{
process_results(gpu_arch: "gfx90a")
}
}
}
}
 /* enable after the cmake file supports packaging
 stage("Packages") {
     when {
......
@@ -10,7 +10,7 @@ rocm/tensorflow:rocm5.1-tf2.6-dev \
 /bin/bash
 ```
-# Install the new rocm-cmake version
+# Install newer version of rocm-cmake
 https://github.com/RadeonOpenCompute/rocm-cmake
 ## Build
@@ -54,6 +54,7 @@ make install
 ```
 ## Using CK as pre-built kernel library
+Instructions for using CK as a pre-built kernel library are under ```client_example/```
 ## Caveat
 ### Kernel Timing and Verification
......
@@ -63,7 +63,7 @@ int main(int argc, char* argv[])
     {
         // use default case
     }
-    else if(argc == 5)
+    else if(argc == 7)
     {
         M = std::stoi(argv[1]);
         N = std::stoi(argv[2]);
......
@@ -31,11 +31,11 @@ using D0DataType = F16;
 using D1DataType = F16;
 using EDataType = F16;
 using ALayout = Row;
 using BLayout = Col;
-using DDELayout = Row;
-using DDELayout = Row;
-using DELayout = Row;
+using D0Layout = Row;
+using D1Layout = Row;
+using ELayout = Row;
 struct SimpleDeviceMem
 {
@@ -105,16 +105,16 @@ int main(int argc, char* argv[])
     SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
     SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
     SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
-                                      f_matrix_space_size(M, N, StrideD0, DDELayout{}));
+                                      f_matrix_space_size(M, N, StrideD0, D0Layout{}));
     SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
-                                      f_matrix_space_size(M, N, StrideD1, DDELayout{}));
+                                      f_matrix_space_size(M, N, StrideD1, D1Layout{}));
-    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
-                                 f_matrix_space_size(M, N, StrideE, DELayout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
     using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
         ALayout,
         BLayout,
-        DDELayout,
+        ck::Tuple<D0Layout, D1Layout>,
+        ELayout,
         ADataType,
         BDataType,
         ck::Tuple<D0DataType, D1DataType>,
......
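Editorial note on the change above: the client example now declares one layout alias per tensor and hands the D-tensor layouts to DeviceGemmMultipleD as a ck::Tuple, instead of reusing a single shared DDELayout/DELayout alias. The stand-alone sketch below (plain C++17 with std::tuple, not composable_kernel code; GemmMultipleD, RowMajor and ColumnMajor are hypothetical stand-ins) illustrates why a tuple is used here: it lets a variadic device op carry an independent compile-time layout tag for every extra D tensor.
```cpp
// Stand-in illustration only; not the composable_kernel API.
#include <iostream>
#include <tuple>

struct RowMajor    { static constexpr const char* name = "RowMajor"; };
struct ColumnMajor { static constexpr const char* name = "ColumnMajor"; };

template <typename ALayout, typename BLayout, typename DsLayouts, typename ELayout>
struct GemmMultipleD;

// One layout parameter per D tensor, unpacked from the tuple.
template <typename ALayout, typename BLayout, typename... DLayouts, typename ELayout>
struct GemmMultipleD<ALayout, BLayout, std::tuple<DLayouts...>, ELayout>
{
    static void describe()
    {
        std::cout << "A: " << ALayout::name << ", B: " << BLayout::name << ", Ds:";
        ((std::cout << ' ' << DLayouts::name), ...);
        std::cout << ", E: " << ELayout::name << '\n';
    }
};

int main()
{
    // Mirrors the aliases in the example: row-major A, column-major B, two row-major Ds, row-major E.
    GemmMultipleD<RowMajor, ColumnMajor, std::tuple<RowMajor, RowMajor>, RowMajor>::describe();
}
```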
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 template <ck::index_t... Is>
@@ -142,9 +142,9 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
     }
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
......
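Editorial note: the GetElementSpace() → GetElementSpaceSize() calls in the hunks above and below size device buffers by the strided footprint of the host tensor descriptor rather than by its element count. The self-contained sketch below (plain C++, not composable_kernel code; element_space_size is a hypothetical helper) shows the quantity in question for a dense, non-overlapping lengths/strides layout.
```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Smallest contiguous span, in elements, touched by a lengths/strides layout with
// non-overlapping, non-negative strides: 1 + sum_i (lengths[i] - 1) * strides[i].
std::size_t element_space_size(const std::vector<std::size_t>& lengths,
                               const std::vector<std::size_t>& strides)
{
    std::size_t space = 1;
    for(std::size_t i = 0; i < lengths.size(); ++i)
        space += (lengths[i] - 1) * strides[i];
    return space;
}

int main()
{
    // A 4 x 3 row-major matrix with a padded leading stride of 8: only 12 elements
    // are addressed, but the buffer must cover 1 + 3*8 + 2*1 = 27 element slots.
    std::cout << element_space_size({4, 3}, {8, 1}) << '\n'; // prints 27
}
```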
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 template <ck::index_t... Is>
@@ -141,9 +141,9 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
     }
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
......
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 template <ck::index_t... Is>
@@ -139,9 +139,9 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
     }
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
......
@@ -11,9 +11,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -170,9 +170,9 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
     }
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
......
@@ -13,9 +13,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 template <ck::index_t... Is>
@@ -155,9 +155,9 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
     }
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
......
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -165,9 +165,9 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
     }
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
......
@@ -13,9 +13,9 @@
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 template <ck::index_t... Is>
@@ -167,9 +167,9 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
     }
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
     a_m_k_device_buf.ToDevice(a_m_k.mData.data());
     b_k_n_device_buf.ToDevice(b_k_n.mData.data());
......
@@ -11,9 +11,9 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -51,33 +51,34 @@ using BDataType = F16;
 using AccDataType = F32;
 using CShuffleDataType = F32;
 using DDataType = F16;
-using DsDataType = ck::Tuple<DDataType>;
 using EDataType = F16;
 using ALayout = Row;
 using BLayout = Col;
-using DELayout = Row;
+using DLayout = Row;
+using ELayout = Row;
 using AElementOp = PassThrough;
 using BElementOp = PassThrough;
 using CDEElementOp = AlphaBetaAdd;
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 using DeviceOpInstance =
     ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
         BLayout,
-        DELayout,
+        ck::Tuple<DLayout>,
+        ELayout,
        ADataType,
        BDataType,
        AccDataType,
        CShuffleDataType,
-        DsDataType,
+        ck::Tuple<DDataType>,
        EDataType,
        AElementOp,
        BElementOp,
        CDEElementOp,
-        GemmDefault,
+        GemmSpec,
        1,
        256,
        256,
@@ -190,9 +191,9 @@ int main(int argc, char* argv[])
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
-    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DELayout{}));
-    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, DELayout{}));
-    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, DELayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -213,10 +214,10 @@ int main(int argc, char* argv[])
         d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
     }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpace());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
     a_device_buf.ToDevice(a_m_k.mData.data());
     b_device_buf.ToDevice(b_k_n.mData.data());
......
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -47,33 +47,34 @@ using BDataType = F16;
 using AccDataType = F32;
 using CShuffleDataType = F16;
 using DDataType = F16;
-using DsDataType = ck::Tuple<DDataType>;
 using EDataType = F16;
 using ALayout = Row;
 using BLayout = Col;
+using DLayout = Row;
 using ELayout = Row;
 using AElementOp = PassThrough;
 using BElementOp = PassThrough;
 using CDEElementOp = AddRelu;
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 using DeviceOpInstance =
     ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
        BLayout,
+        ck::Tuple<DLayout>,
        ELayout,
        ADataType,
        BDataType,
        AccDataType,
        CShuffleDataType,
-        DsDataType,
+        ck::Tuple<DDataType>,
        EDataType,
        AElementOp,
        BElementOp,
        CDEElementOp,
-        GemmDefault,
+        GemmSpec,
        1,
        256,
        256,
@@ -191,10 +192,10 @@ int main(int argc, char* argv[])
         d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
     }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpace());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
     a_device_buf.ToDevice(a_m_k.mData.data());
     b_device_buf.ToDevice(b_k_n.mData.data());
......
@@ -12,9 +12,9 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -43,6 +43,7 @@ using ALayout = Row;
 using BLayout = Col;
 using D0Layout = Row;
 using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
 using ELayout = Row;
 using AElementOp = PassThrough;
@@ -53,11 +54,11 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 // clang-format off
 using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
-//######| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-//######| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+        < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
 // clang-format on
 int main(int argc, char* argv[])
@@ -156,11 +157,11 @@ int main(int argc, char* argv[])
         d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
     }
-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
-    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpace());
-    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpace());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
     a_device_buf.ToDevice(a_m_k.mData.data());
     b_device_buf.ToDevice(b_k_n.mData.data());
......
add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_util)
# Instructions for ```example_conv_xdl_bias_relu```
## Run ```example_conv_xdl_bias_relu```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./bin/example_conv_xdl_bias_relu 0 1 5
```
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
bias_k: dim 1, lengths {256}, strides {1}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 1.39009 ms, 105.581 TFlops, 239.981 GB/s
```
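As a quick sanity check on the numbers above (editorial note, not part of the example): assuming the usual 2·N·K·C·Y·X·Ho·Wo flop count for a forward convolution, the reported 1.39009 ms corresponds to roughly 105.6 TFlops, which matches the "Perf:" line.
```cpp
#include <cstdint>
#include <iostream>

int main()
{
    // Problem size from the log above: N=128, K=256, C=192, 3x3 filter, 36x36 output.
    const std::uint64_t N = 128, K = 256, C = 192, Y = 3, X = 3, Ho = 36, Wo = 36;
    const double flop    = 2.0 * N * K * C * Y * X * Ho * Wo; // ~1.468e11 flop
    const double time_ms = 1.39009;                           // measured kernel time
    std::cout << flop / 1.0e9 / time_ms << " TFlops\n";       // prints ~105.58
}
```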
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
namespace {
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
// clang-format off
using DeviceConvFwdInstance = ck::tensor_operation::device::
DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
InDataType, // InDataType
WeiDataType, // WeiDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
InElementOp, // InElementwiseOperation
WeiElementOp, // WeiElementwiseOperation
OutElementOp, // OutElementwiseOperation
MemorySet, // OutGlobalMemoryDataOperation
ConvFwdDefault, // ConvForwardSpecialization
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
4, // K0PerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on
using ReferenceConvFwdInstance =
ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
void PrintUseMsg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=n0, 1=yes)\n"
<< "Following arguments:\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl;
}
ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int num_dim_spatial = 2;
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 4;
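// Note: with num_dim_spatial = 2 this expects conv_args = 3 + 2 * 6 = 15 problem
// parameters; adding the program name and the three leading flags (verification,
// initialization, time kernel) gives argc == 19, i.e. "arg4 to 18" in the README above.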
if(cmdline_nargs != argc)
{
PrintUseMsg();
exit(0);
}
ck::utils::conv::ConvParams params;
int arg_idx = 4;
params.num_dim_spatial_ = num_dim_spatial;
params.N_ = std::stoi(argv[arg_idx++]);
params.K_ = std::stoi(argv[arg_idx++]);
params.C_ = std::stoi(argv[arg_idx++]);
params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
}
} // anonymous namespace
int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
const int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
if(argc >= 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
if(argc >= 5)
{
params = ParseConvParams(argc, argv);
}
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
static_cast<std::size_t>(params.C_)};
input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths_),
std::end(params.input_spatial_lengths_));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
static_cast<std::size_t>(params.C_)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths_),
std::end(params.filter_spatial_lengths_));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
static_cast<std::size_t>(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
Tensor<OutDataType> host_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
Tensor<OutDataType> device_output(
get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
std::cout << "bias: " << bias.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
auto conv = DeviceConvFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument =
conv.MakeArgument(static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
params.N_,
params.K_,
params.C_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
output_spatial_lengths,
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device operator with the specified compilation parameters does "
"not support this problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype =
get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
params.C_,
params.K_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
output_spatial_lengths) +
sizeof(OutDataType) * (params.K_);
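// ave_time is reported in milliseconds, so flop / 1e9 / ave_time is GFlop per
// millisecond, i.e. TFlop/s; likewise num_btype / 1e6 / ave_time gives GB/s.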
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weights,
host_output,
bias,
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
return ck::utils::check_err(device_output.mData, host_output.mData) ? 0 : 1;
}
return 0;
}
# FIXME: should fix validation failure
add_example_executable_no_testing(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_util)
# Instructions for ```example_conv_xdl_bias_relu_add```
## Run ```example_conv_xdl_bias_relu_add```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./bin/example_conv_xdl_bias_relu_add 0 1 5
```
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
bias_k: dim 1, lengths {256}, strides {1}
resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 1.44711 ms, 101.421 TFlops, 289.218 GB/s
```