diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d2f57be30b0ca2bd0eb726f90829885ba8d613f..ef46d96f4d25d406b4eef6542ad860f249e0bfaa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
enable_testing()
set(ROCM_SYMLINK_LIBS OFF)
-find_package(ROCM 0.8 REQUIRED PATHS /opt/rocm)
+find_package(ROCM REQUIRED PATHS /opt/rocm)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
@@ -71,13 +71,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
endif()
message(STATUS "Build with HIP ${HIP_VERSION}")
-rocm_create_package(
- NAME composablekernel
- DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
-    MAINTAINER "MIOpen Kernels Dev Team"
- LDCONFIG
-)
-
## tidy
include(EnableCompilerWarnings)
set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
diff --git a/Dockerfile b/Dockerfile
index 0d32b52f75ac89b0138810af692d7a8177e38f0e..3d01b36c0172545ca1e4c6ca2ad6cd0f5d552980 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,6 +2,7 @@ FROM ubuntu:18.04
ARG ROCMVERSION=5.1
ARG OSDB_BKC_VERSION
+ARG compiler_version
RUN set -xe
@@ -15,7 +16,6 @@ RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.l
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"
-# ADD requirements.txt requirements.txt
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
apt-utils \
@@ -23,8 +23,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
cmake-data=3.15.1-0kitware1 \
cmake=3.15.1-0kitware1 \
curl \
- g++ \
- gdb \
git \
hip-rocclr \
jq \
@@ -61,17 +59,7 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb
-# Install cget
-RUN pip install cget
-
-# Install rclone
-RUN pip install https://github.com/pfultz2/rclone/archive/master.tar.gz
-
ARG PREFIX=/opt/rocm
-# Install dependencies
-RUN cget install pfultz2/rocm-recipes
-# Install rbuild
-RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/6d78a0553babdaea8d2da5de15cbda7e869594b8.tar.gz
# Install packages for processing the performance results
RUN pip3 install --upgrade pip
RUN pip3 install sqlalchemy
@@ -84,12 +72,26 @@ ENV UBSAN_OPTIONS=print_stacktrace=1
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
-ADD rbuild.ini /rbuild.ini
ADD dev-requirements.txt dev-requirements.txt
-RUN rbuild prepare -s develop -d $PREFIX
RUN groupadd -f render
# Install the new rocm-cmake version
RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git && \
cd rocm-cmake && mkdir build && cd build && \
cmake .. && cmake --build . && cmake --build . --target install
+
+WORKDIR /
+
+ENV compiler_version=$compiler_version
+RUN sh -c "echo compiler version = '$compiler_version'"
+
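+# For non-release compilers, build the requested llvm-project branch from source.
+# HIP is pointed at the resulting toolchain via HIP_CLANG_PATH, which the
+# Jenkinsfile sets per-container for non-release builds.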
+RUN --mount=type=ssh if [ "$compiler_version" != "release" ]; then \
+ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
+ cd llvm-project && mkdir build && cd build && \
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
+ make -j 8 ; \
+ else echo "using the release compiler"; \
+ fi
+
+#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
+#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
diff --git a/Jenkinsfile b/Jenkinsfile
index 15be3e540c49aef417b4f5401eb75d67d41c4465..f60507d21af201f1dbe5537cd2a8008c8c336717 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -11,6 +11,96 @@ def show_node_info() {
"""
}
+def runShell(String command){
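+    // run the command with stdout redirected to tmp.txt; returns true when the
+    // command produced any output (used below to detect grep matches in clinfo logs)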
+ def responseCode = sh returnStatus: true, script: "${command} > tmp.txt"
+ def output = readFile(file: "tmp.txt")
+ echo "tmp.txt contents: $output"
+ return (output != "")
+}
+
+def getDockerImageName(){
+ def img = "${env.MIOPEN_IMAGE_URL}:composable_kernels_${params.COMPILER_VERSION}"
+ return img
+}
+
+def getDockerImage(Map conf=[:]){
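+    // pull the prebuilt image tagged for the selected compiler version; the image
+    // must already exist in the registry (it is produced by buildDocker below)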
+ env.DOCKER_BUILDKIT=1
+ def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
+ def gpu_arch = conf.get("gpu_arch", "gfx908") // prebuilt dockers should have all the architectures enabled so one image can be used for all stages
+ def no_cache = conf.get("no_cache", false)
+ def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
+ if(env.CCACHE_HOST)
+ {
+ def check_host = sh(script:"""(printf "PING\r\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
+ if(check_host == "+PONG")
+ {
+ echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
+ }
+ else
+ {
+ echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
+ }
+ dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
+ env.CCACHE_DIR = """/tmp/ccache_store"""
+ env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}"""
+ }
+ if(no_cache)
+ {
+ dockerArgs = dockerArgs + " --no-cache "
+ }
+ echo "Docker Args: ${dockerArgs}"
+ def image = getDockerImageName()
+ //Check if image exists
+ def retimage
+ try
+ {
+ echo "Pulling down image: ${image}"
+ retimage = docker.image("${image}")
+ retimage.pull()
+ }
+ catch(Exception ex)
+ {
+ error "Unable to locate image: ${image}"
+ }
+ return [retimage, image]
+}
+
+def buildDocker(install_prefix){
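+    // build and push the per-compiler-version image, unless the registry already
+    // holds a matching manifest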
+ show_node_info()
+ env.DOCKER_BUILDKIT=1
+ checkout scm
+ def image_name = getDockerImageName()
+ echo "Building Docker for ${image_name}"
+ def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' "
+ if(env.CCACHE_HOST)
+ {
+ def check_host = sh(script:"""(printf "PING\\r\\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
+ if(check_host == "+PONG")
+ {
+ echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
+ }
+ else
+ {
+ echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
+ }
+ dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
+ env.CCACHE_DIR = """/tmp/ccache_store"""
+ env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}"""
+ }
+
+ echo "Build Args: ${dockerArgs}"
+ try{
+ echo "Checking for image: ${image_name}"
+ sh "docker manifest inspect --insecure ${image_name}"
+ echo "Image: ${image_name} found!! Skipping building image"
+ }
+ catch(Exception ex){
+ echo "Unable to locate image: ${image_name}. Building image now"
+ retimage = docker.build("${image_name}", dockerArgs + ' .')
+ retimage.push()
+ }
+}
+
def cmake_build(Map conf=[:]){
def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
@@ -60,7 +150,7 @@ def cmake_build(Map conf=[:]){
"""
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
// reduce parallelism when compiling, clang uses too much memory
- def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 1 )) ${config_targets}")
+ def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}")
def execute_cmd = conf.get("execute_cmd", "")
def cmd = conf.get("cmd", """
@@ -85,7 +175,7 @@ def buildHipClangJob(Map conf=[:]){
env.HSA_ENABLE_SDMA=0
checkout scm
- def image = "composable_kernels"
+ def image = "composable_kernels_${params.COMPILER_VERSION}"
def prefixpath = conf.get("prefixpath", "/opt/rocm")
def gpu_arch = conf.get("gpu_arch", "gfx908")
@@ -93,51 +183,56 @@ def buildHipClangJob(Map conf=[:]){
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
- dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
+ }
+ //def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
+ def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
+ if (params.COMPILER_VERSION != "release"){
+ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
- def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
- if (params.USE_DOCKERFILE){
- try {
- retimage = docker.build("${image}", dockerArgs + '.')
- withDockerContainer(image: image, args: dockerOpts) {
- timeout(time: 5, unit: 'MINUTES')
- {
- sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+ try {
+ //retimage = docker.build("${image}", dockerArgs + '.')
+ (retimage, image) = getDockerImage(conf)
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES'){
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+ if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+ throw new Exception ("GPU not found")
}
- }
- }
- catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
- echo "The job was cancelled or aborted"
- throw e
- }
- catch(Exception ex) {
- retimage = docker.build("${image}", dockerArgs + "--no-cache .")
- withDockerContainer(image: image, args: dockerOpts) {
- timeout(time: 5, unit: 'MINUTES')
- {
- sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+ else{
+ echo "GPU is OK"
}
}
}
}
- else{
- timeout(time: 3, unit: 'HOURS'){
- retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
- image="b56f8ac0d6ea"
- sh "docker images"
+ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+ echo "The job was cancelled or aborted"
+ throw e
+ }
+ catch(Exception ex) {
+ retimage = docker.build("${image}", dockerArgs + " --no-cache .")
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES'){
+                    sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+ if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+ throw new Exception ("GPU not found")
+ }
+ else{
+ echo "GPU is OK"
+ }
+ }
}
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 5, unit: 'HOURS')
{
- sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
cmake_build(conf)
}
}
@@ -149,10 +244,6 @@ def reboot(){
build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
}
-
-
-
-
def buildHipClangJobAndReboot(Map conf=[:]){
try{
buildHipClangJob(conf)
@@ -169,14 +260,14 @@ def buildHipClangJobAndReboot(Map conf=[:]){
}
}
-
def runCKProfiler(Map conf=[:]){
show_node_info()
env.HSA_ENABLE_SDMA=0
checkout scm
- def image = "composable_kernels"
+
+ def image = "composable_kernels_${params.COMPILER_VERSION}"
def prefixpath = conf.get("prefixpath", "/opt/rocm")
def gpu_arch = conf.get("gpu_arch", "gfx908")
@@ -184,98 +275,92 @@ def runCKProfiler(Map conf=[:]){
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
- dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
+ }
+ def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
+ if (params.COMPILER_VERSION != "release"){
+ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
- def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' "
def variant = env.STAGE_NAME
-
def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
- if (params.USE_DOCKERFILE){
- try {
- retimage = docker.build("${image}", dockerArgs + '.')
- withDockerContainer(image: image, args: dockerOpts) {
- timeout(time: 5, unit: 'MINUTES')
- {
- sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+ try {
+ //retimage = docker.build("${image}", dockerArgs + '.')
+ (retimage, image) = getDockerImage(conf)
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES'){
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+ if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+ throw new Exception ("GPU not found")
}
- }
- }
- catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
- echo "The job was cancelled or aborted"
- throw e
- }
- catch(Exception ex) {
- retimage = docker.build("${image}", dockerArgs + "--no-cache .")
- withDockerContainer(image: image, args: dockerOpts) {
- timeout(time: 5, unit: 'MINUTES')
- {
- sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
+ else{
+ echo "GPU is OK"
}
}
}
}
- else{
- timeout(time: 3, unit: 'HOURS'){
- retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
- image="b56f8ac0d6ea"
- sh "docker images"
+ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+ echo "The job was cancelled or aborted"
+ throw e
+ }
+ catch(Exception ex) {
+ retimage = docker.build("${image}", dockerArgs + " --no-cache .")
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES'){
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+ if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+ throw new Exception ("GPU not found")
+ }
+ else{
+ echo "GPU is OK"
+ }
+ }
}
}
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
- timeout(time: 5, unit: 'HOURS')
+ timeout(time: 24, unit: 'HOURS')
{
cmake_build(conf)
dir("script"){
- //run gemm performance tests
- def gemm_log = "perf_gemm_${gpu_arch}.log"
- sh "rm -f ${gemm_log}"
- sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}"
- sh "echo Node name: ${NODE_NAME} >> ${gemm_log}"
- sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
- sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
- sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
- sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
- sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}"
- sh "./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}"
- //results will be parsed, stored, and analyzed within the python script
- //the script will return 0 if the performance criteria are met
- //or return 1 if the criteria are not met
- archiveArtifacts "${gemm_log}"
- sh "python3 parse_perf_data.py ${gemm_log} "
- //run resnet50 test
- def resnet_log = "perf_resnet50_${gpu_arch}.log"
- sh "rm -f ${resnet_log}"
- sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet_log}"
- sh "echo Node name: ${NODE_NAME} >> ${resnet_log}"
- sh "echo GPU_arch name: ${gpu_arch} >> ${resnet_log}"
- sh "rocminfo | grep 'Compute Unit:' >> ${resnet_log} "
- sh "hipcc --version | grep -e 'HIP version' >> ${resnet_log}"
- sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}"
- //first run tests with N=256
- sh "./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log}"
- //then run with N=4
- sh "./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log}"
- archiveArtifacts "${resnet_log}"
- //the script will put the results from N=256 and N=4 runs into separate tables
- sh "python3 parse_perf_data.py ${resnet_log} "
+ if (params.RUN_FULL_QA){
+ def qa_log = "qa_${gpu_arch}.log"
+ sh "./run_full_performance_tests.sh 1 QA_${params.COMPILER_VERSION} ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+ archiveArtifacts "perf_gemm_${gpu_arch}.log"
+ archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
+ archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
+ archiveArtifacts "perf_batched_gemm_${gpu_arch}.log"
+ archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
+ archiveArtifacts "perf_conv_fwd_${gpu_arch}.log"
+ archiveArtifacts "perf_conv_bwd_data_${gpu_arch}.log"
+ archiveArtifacts "perf_gemm_bilinear_${gpu_arch}.log"
+ archiveArtifacts "perf_reduction_${gpu_arch}.log"
+ // stash perf files to master
+ stash name: "perf_gemm_${gpu_arch}.log"
+ stash name: "perf_resnet50_N256_${gpu_arch}.log"
+ stash name: "perf_resnet50_N4_${gpu_arch}.log"
+ stash name: "perf_batched_gemm_${gpu_arch}.log"
+ stash name: "perf_grouped_gemm_${gpu_arch}.log"
+ stash name: "perf_conv_fwd_${gpu_arch}.log"
+ stash name: "perf_conv_bwd_data_${gpu_arch}.log"
+ stash name: "perf_gemm_bilinear_${gpu_arch}.log"
+ stash name: "perf_reduction_${gpu_arch}.log"
+ //we will process results on the master node
+ }
+ else{
+ sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
+ archiveArtifacts "perf_gemm_${gpu_arch}.log"
+ archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
+ archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
+ // stash perf files to master
+ stash name: "perf_gemm_${gpu_arch}.log"
+ stash name: "perf_resnet50_N256_${gpu_arch}.log"
+ stash name: "perf_resnet50_N4_${gpu_arch}.log"
+ //we will process the results on the master node
+ }
+
}
}
}
@@ -283,7 +368,6 @@ def runCKProfiler(Map conf=[:]){
return retimage
}
-
def runPerfTest(Map conf=[:]){
try{
runCKProfiler(conf)
@@ -300,16 +384,97 @@ def runPerfTest(Map conf=[:]){
}
}
+def process_results(Map conf=[:]){
+ env.HSA_ENABLE_SDMA=0
+ checkout scm
+ def image = "composable_kernels_${params.COMPILER_VERSION}"
+ def prefixpath = "/opt/rocm"
+ def gpu_arch = conf.get("gpu_arch", "gfx908")
+
+ // Jenkins is complaining about the render group
+ def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+ if (conf.get("enforce_xnack_on", false)) {
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
+ }
+ def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='release' "
+
+ def variant = env.STAGE_NAME
+ def retimage
+
+ gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+ try {
+ //retimage = docker.build("${image}", dockerArgs + '.')
+ (retimage, image) = getDockerImage(conf)
+ }
+ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+ echo "The job was cancelled or aborted"
+ throw e
+ }
+ }
+
+ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+ timeout(time: 1, unit: 'HOURS'){
+ try{
+ dir("script"){
+ if (params.RUN_FULL_QA){
+ // unstash perf files to master
+ unstash "perf_gemm_${gpu_arch}.log"
+ unstash "perf_resnet50_N256_${gpu_arch}.log"
+ unstash "perf_resnet50_N4_${gpu_arch}.log"
+ unstash "perf_batched_gemm_${gpu_arch}.log"
+ unstash "perf_grouped_gemm_${gpu_arch}.log"
+ unstash "perf_conv_fwd_${gpu_arch}.log"
+ unstash "perf_conv_bwd_data_${gpu_arch}.log"
+ unstash "perf_gemm_bilinear_${gpu_arch}.log"
+ unstash "perf_reduction_${gpu_arch}.log"
+ sh "./process_qa_data.sh ${gpu_arch}"
+ }
+ else{
+ // unstash perf files to master
+ unstash "perf_gemm_${gpu_arch}.log"
+ unstash "perf_resnet50_N256_${gpu_arch}.log"
+ unstash "perf_resnet50_N4_${gpu_arch}.log"
+ sh "./process_perf_data.sh ${gpu_arch}"
+ }
+ }
+ }
+ catch(e){
+                echo "Error while processing the performance test results"
+ echo 'Exception occurred: ' + e.toString()
+ throw e
+ }
+ }
+ }
+}
+
+//launch develop branch daily at 23:00 in FULL_QA mode
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true''' : ""
+
pipeline {
agent none
+ triggers {
+ parameterizedCron(CRON_SETTINGS)
+ }
options {
parallelsAlwaysFailFast()
}
parameters {
booleanParam(
- name: "USE_DOCKERFILE",
+ name: "BUILD_DOCKER",
defaultValue: true,
- description: "")
+ description: "Force building docker image (default: true)")
+ string(
+ name: 'COMPILER_VERSION',
+ defaultValue: 'ck-9110',
+ description: 'Specify which version of compiler to use: ck-9110 (default), release, or amd-stg-open.')
+ booleanParam(
+ name: "RUN_FULL_QA",
+ defaultValue: false,
+        description: "Select whether to run a small set of performance tests (default) or the full QA suite")
+ booleanParam(
+ name: "TEST_NODE_PERFORMANCE",
+ defaultValue: false,
+ description: "Test the node GPU performance (default: false)")
}
environment{
dbuser = "${dbuser}"
@@ -319,9 +484,28 @@ pipeline {
dbsshuser = "${dbsshuser}"
dbsshpassword = "${dbsshpassword}"
status_wrapper_creds = "${status_wrapper_creds}"
+ gerrit_cred="${gerrit_cred}"
+ DOCKER_BUILDKIT = "1"
}
stages{
+ stage("Build Docker"){
+ when {
+ expression { params.BUILD_DOCKER.toBoolean() }
+ }
+ parallel{
+ stage('Docker /opt/rocm'){
+ agent{ label rocmnode("nogpu") }
+ steps{
+ buildDocker('/opt/rocm')
+ }
+ }
+ }
+ }
stage("Static checks") {
+ when {
+ beforeAgent true
+ expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
+ }
parallel{
// enable after we move from hipcc to hip-clang
// stage('Tidy') {
@@ -355,6 +539,10 @@ pipeline {
}
stage("Tests")
{
+ when {
+ beforeAgent true
+ expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
+ }
parallel
{
stage("Run Tests: gfx908")
@@ -369,6 +557,10 @@ pipeline {
}
stage("Run Tests: gfx90a")
{
+ when {
+ beforeAgent true
+ expression { params.RUN_FULL_QA.toBoolean() }
+ }
agent{ label rocmnode("gfx90a")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
@@ -381,6 +573,10 @@ pipeline {
}
stage("Client App")
{
+ when {
+ beforeAgent true
+ expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
+ }
parallel
{
stage("Run Client App")
@@ -402,6 +598,10 @@ pipeline {
{
stage("Run ckProfiler: gfx908")
{
+ when {
+ beforeAgent true
+ expression { !params.RUN_FULL_QA.toBoolean() && !params.TEST_NODE_PERFORMANCE.toBoolean() }
+ }
agent{ label rocmnode("gfx908")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
@@ -412,6 +612,10 @@ pipeline {
}
stage("Run ckProfiler: gfx90a")
{
+ when {
+ beforeAgent true
+ expression { params.RUN_FULL_QA.toBoolean() || params.TEST_NODE_PERFORMANCE.toBoolean() }
+ }
agent{ label rocmnode("gfx90a")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
@@ -422,6 +626,33 @@ pipeline {
}
}
}
+ stage("Process Performance Test Results")
+ {
+ parallel
+ {
+ stage("Process results for gfx908"){
+ when {
+ beforeAgent true
+ expression { !params.RUN_FULL_QA.toBoolean() && !params.TEST_NODE_PERFORMANCE.toBoolean() }
+ }
+ agent { label 'mici' }
+ steps{
+ process_results(gpu_arch: "gfx908")
+ }
+ }
+ stage("Process results for gfx90a"){
+ when {
+ beforeAgent true
+ expression { params.RUN_FULL_QA.toBoolean() || params.TEST_NODE_PERFORMANCE.toBoolean() }
+ }
+ agent { label 'mici' }
+ steps{
+ process_results(gpu_arch: "gfx90a")
+ }
+ }
+ }
+ }
+
/* enable after the cmake file supports packaging
stage("Packages") {
when {
diff --git a/README.md b/README.md
index 5f9f95859b3027f5d132fc8ce653ef6944b53190..bbc4d2bc30a297d7e58a7b32d4dafb28bbc738d3 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ rocm/tensorflow:rocm5.1-tf2.6-dev \
/bin/bash
```
-# Install the new rocm-cmake version
+# Install a newer version of rocm-cmake
https://github.com/RadeonOpenCompute/rocm-cmake
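+
+One way to install it from source, mirroring the Dockerfile in this repository:
+```bash
+git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git
+cd rocm-cmake && mkdir build && cd build
+cmake .. && cmake --build . && cmake --build . --target install
+```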
## Build
@@ -26,6 +26,7 @@ cmake \
-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
+-D CMAKE_INSTALL_PREFIX=${PATH_TO_CK_INSTALL_DIRECTORY} \
..
```
@@ -47,6 +48,13 @@ Instructions for running each individual examples are under ```example/```
```
Instructions for running ckProfiler are under ```profiler/```
+## Install CK
+```bash
+make install
+```
+
+## Using CK as a pre-built kernel library
+Instructions for using CK as a pre-built kernel library are under ```client_example/```
## Caveat
### Kernel Timing and Verification
diff --git a/client_example/01_gemm/CMakeLists.txt b/client_example/01_gemm/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9e741192f90b8216e4b3abe32ae8971fb45ddfee
--- /dev/null
+++ b/client_example/01_gemm/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_gemm gemm.cpp)
+target_link_libraries(client_gemm PRIVATE composable_kernel::device_operations)
diff --git a/client_example/01_gemm/gemm.cpp b/client_example/01_gemm/gemm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a8a6bf16c2b09a3204b119be63258f9ca213336f
--- /dev/null
+++ b/client_example/01_gemm/gemm.cpp
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+using ADataType = F16;
+using BDataType = F16;
+using CDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
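+// minimal RAII holder for raw device memory; hipMalloc/hipFree error codes are intentionally discarded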
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideC = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 7)
+ {
+ M = std::stoi(argv[1]);
+ N = std::stoi(argv[2]);
+ K = std::stoi(argv[3]);
+
+ StrideA = std::stoi(argv[4]);
+ StrideB = std::stoi(argv[5]);
+ StrideC = std::stoi(argv[6]);
+ }
+ else
+ {
+ printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n");
+ exit(0);
+ }
+
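+    // element count spanned by an nRow x nCol matrix with leading dimension 'stride':
+    // (nRow - 1) * stride + nCol for row-major, (nCol - 1) * stride + nRow for column-major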
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+            if(std::is_same<Layout, Row>::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
+
+ using DeviceOp =
+        ck::tensor_operation::device::DeviceGemm<ALayout, BLayout, CLayout,
+                                                 ADataType, BDataType, CDataType,
+                                                 AElementOp, BElementOp, CElementOp>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto c_element_op = CElementOp{};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ c_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
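+            // ave_time is in ms: flop / 1e9 / ms == TFLOPS, and bytes / 1e6 / ms == GB/s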
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance, if one supported the problem
+    if(found)
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ c_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
index bdd6e05029f4563cee6145e1bd1d8d09957d562f..f88e72b62e4a20beeda8b1d5432a1055c32e2627 100644
--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/tensor_operation_instance/gpu/device_gemm_add_add_fastgelu_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
using F16 = ck::half_t;
using F32 = float;
@@ -25,12 +25,11 @@ using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddAddFastGelu;
-using ADataType = F16;
-using BDataType = F16;
-using AccDataType = F32;
-using D0DataType = F16;
-using D1DataType = F16;
-using EDataType = F16;
+using ADataType = F16;
+using BDataType = F16;
+using D0DataType = F16;
+using D1DataType = F16;
+using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
@@ -111,19 +110,22 @@ int main(int argc, char* argv[])
f_matrix_space_size(M, N, StrideD1, D1Layout{}));
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
- // add device op instances
- const auto op_ptrs = ck::tensor_operation::device::device_gemm_instance::
- get_device_gemm_add_add_fastgelu_instances();
+ using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+ ALayout,
+ BLayout,
+        ck::Tuple<D0Layout, D1Layout>,
+ ELayout,
+ ADataType,
+ BDataType,
+        ck::Tuple<D0DataType, D1DataType>,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::AddAddFastGelu>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
@@ -231,6 +233,8 @@ int main(int argc, char* argv[])
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
+
+ std::cout << "Done" << std::endl;
}
return 0;
diff --git a/client_example/03_gemm_layernorm/CMakeLists.txt b/client_example/03_gemm_layernorm/CMakeLists.txt
index 8eeaffc0085ebdf29b0467f0dfed39fcd439bf2d..3742e70844b96575e263b22a14b0bb8c4cde7a43 100644
--- a/client_example/03_gemm_layernorm/CMakeLists.txt
+++ b/client_example/03_gemm_layernorm/CMakeLists.txt
@@ -1,2 +1,2 @@
-add_executable(gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp)
-target_link_libraries(gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations)
+add_executable(client_gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp)
+target_link_libraries(client_gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations)
diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
index bc47a3929a26aa7fba2f21e378b6e3de36e3bf59..8f142937281a712d1004e15a578fc64d6501d473 100644
--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
@@ -160,16 +160,17 @@ int main()
ck::index_t StrideC = 1024;
ck::index_t StrideD0 = 1024;
- const auto gemm_reduce_ptrs = ck::tensor_operation::device::device_gemm_instance::
- get_device_gemm_add_add_mean_squaremean_instances();
+ const auto gemm_reduce_ptrs =
+ ck::tensor_operation::device::instance::get_device_gemm_add_add_mean_squaremean_instances<
+ ADataType,
+ BDataType,
+ CDataType,
+ ALayout,
+ BLayout,
+ CLayout>();
const auto normalize_ptrs =
- ck::tensor_operation::device::get_device_normalize_from_mean_meansquare_instances<
+ ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances<
CDataType,
ReduceDataType,
ReduceDataType,
@@ -267,4 +268,4 @@ int main()
<< std::endl;
}
}
-}
\ No newline at end of file
+}
diff --git a/client_example/04_contraction/CMakeLists.txt b/client_example/04_contraction/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4bc6780f96d2fe4a4912e3c188b4b5155cc162dd
--- /dev/null
+++ b/client_example/04_contraction/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_executable(client_contraction_scale contraction_scale.cpp)
+target_link_libraries(client_contraction_scale PRIVATE composable_kernel::device_operations)
+
+add_executable(client_contraction_bilinear contraction_bilinear.cpp)
+target_link_libraries(client_contraction_bilinear PRIVATE composable_kernel::device_operations)
+
diff --git a/client_example/04_contraction/contraction_bilinear.cpp b/client_example/04_contraction/contraction_bilinear.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b71c51c02620ce62257e3b33a6165a1c8ddda2b1
--- /dev/null
+++ b/client_example/04_contraction/contraction_bilinear.cpp
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
+
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = Bilinear;
+
+using ADataType = F32;
+using BDataType = F32;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using DDataType = F32;
+using DsDataType = ck::Tuple<DDataType>;
+using EDataType = F32;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+
+ float alpha = 1.f;
+ float beta = 1.f;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 25)
+ {
+ const ck::index_t M0 = std::stoi(argv[1]);
+ const ck::index_t M1 = std::stoi(argv[2]);
+
+ const ck::index_t N0 = std::stoi(argv[3]);
+ const ck::index_t N1 = std::stoi(argv[4]);
+
+ const ck::index_t K0 = std::stoi(argv[5]);
+ const ck::index_t K1 = std::stoi(argv[6]);
+
+ a_ms_ks_lengths = {M0, M1, K0, K1};
+ a_ms_ks_strides = {
+ std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
+
+ b_ns_ks_lengths = {N0, N1, K0, K1};
+ b_ns_ks_strides = {
+ std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
+
+ d_ms_ns_lengths = {M0, M1, N0, N1};
+ d_ms_ns_strides = {
+ std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
+
+ e_ms_ns_lengths = {M0, M1, N0, N1};
+ e_ms_ns_strides = {
+ std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])};
+
+ alpha = std::stof(argv[23]);
+ beta = std::stof(argv[24]);
+ }
+ else
+ {
+ printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
+ printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+ printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+ printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
+ printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+ printf("arg23 to 24: alpha, beta\n");
+ exit(0);
+ }
+
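+    // smallest element count covering a tensor with the given lengths and strides:
+    // 1 + sum_i (lengths[i] - 1) * strides[i]; e.g. the default lengths {30, 128, 32, 64}
+    // with strides {524288, 4096, 128, 1} span 15728576 elements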
+ auto f_tensor_space_size = [](auto lengths, auto strides) {
+ std::size_t space_size = 1;
+ for(std::size_t i = 0; i < lengths.size(); ++i)
+ {
+ space_size += (lengths[i] - 1) * strides[i];
+ }
+ return space_size;
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+ f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+ f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
+ SimpleDeviceMem d_device_buf(sizeof(DDataType) *
+ f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+ f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
+
+ using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
+ NumDimM,
+ NumDimN,
+ NumDimK,
+ ADataType,
+ BDataType,
+        ck::Tuple<DDataType>,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::Bilinear>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{alpha, beta};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr =
+ op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+                                        std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ a_ms_ks_lengths,
+ a_ms_ks_strides,
+ b_ns_ks_lengths,
+ b_ns_ks_strides,
+                                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+ e_ms_ns_lengths,
+ e_ms_ns_strides,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
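+            // collapse the two M, N, and K index dimensions into single GEMM-like extents for the flop estimate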
+ ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
+ e_ms_ns_lengths.begin() + NumDimM,
+ ck::index_t{1},
+                                            std::multiplies<ck::index_t>{});
+
+ ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
+ e_ms_ns_lengths.begin() + NumDimM + NumDimN,
+ ck::index_t{1},
+                                            std::multiplies<ck::index_t>{});
+
+ ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
+ a_ms_ks_lengths.begin() + NumDimM + NumDimK,
+ ck::index_t{1},
+                                            std::multiplies<ck::index_t>{});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+ sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ return 0;
+}
diff --git a/client_example/04_contraction/contraction_scale.cpp b/client_example/04_contraction/contraction_scale.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5908c1d86e678796dec3d2616c83e9fca40595fb
--- /dev/null
+++ b/client_example/04_contraction/contraction_scale.cpp
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
+
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Scale = ck::tensor_operation::element_wise::Scale;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = Scale;
+
+using ADataType = F32;
+using BDataType = F32;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using DsDataType = ck::Tuple<>;
+using EDataType = F32;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+
+ float scale = 1.f;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 20)
+ {
+ const ck::index_t M0 = std::stoi(argv[1]);
+ const ck::index_t M1 = std::stoi(argv[2]);
+
+ const ck::index_t N0 = std::stoi(argv[3]);
+ const ck::index_t N1 = std::stoi(argv[4]);
+
+ const ck::index_t K0 = std::stoi(argv[5]);
+ const ck::index_t K1 = std::stoi(argv[6]);
+
+ a_ms_ks_lengths = {M0, M1, K0, K1};
+ a_ms_ks_strides = {
+ std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
+
+ b_ns_ks_lengths = {N0, N1, K0, K1};
+ b_ns_ks_strides = {
+ std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
+
+ e_ms_ns_lengths = {M0, M1, N0, N1};
+ e_ms_ns_strides = {
+ std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
+
+ scale = std::stof(argv[19]);
+ }
+ else
+ {
+ printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
+ printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+ printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+ printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+ printf("arg19: scale\n");
+ exit(0);
+ }
+
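+    // as in contraction_bilinear.cpp: 1 + sum_i (lengths[i] - 1) * strides[i] elements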
+ auto f_tensor_space_size = [](auto lengths, auto strides) {
+ std::size_t space_size = 1;
+ for(std::size_t i = 0; i < lengths.size(); ++i)
+ {
+ space_size += (lengths[i] - 1) * strides[i];
+ }
+ return space_size;
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+ f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+ f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+ f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
+
+ using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
+ NumDimM,
+ NumDimN,
+ NumDimK,
+ ADataType,
+ BDataType,
+ ck::Tuple<>,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::Scale>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{scale};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+                                                    std::array<const void*, 0>{},
+ e_device_buf.GetDeviceBuffer(),
+ a_ms_ks_lengths,
+ a_ms_ks_strides,
+ b_ns_ks_lengths,
+ b_ns_ks_strides,
+                                                    std::array<std::vector<ck::index_t>, 0>{},
+                                                    std::array<std::vector<ck::index_t>, 0>{},
+ e_ms_ns_lengths,
+ e_ms_ns_strides,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
+ e_ms_ns_lengths.begin() + NumDimM,
+ ck::index_t{1},
+                                            std::multiplies<ck::index_t>{});
+
+ ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
+ e_ms_ns_lengths.begin() + NumDimM + NumDimN,
+ ck::index_t{1},
+                                            std::multiplies<ck::index_t>{});
+
+ ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
+ a_ms_ks_lengths.begin() + NumDimM + NumDimK,
+ ck::index_t{1},
+                                            std::multiplies<ck::index_t>{});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ return 0;
+}
diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt
index a8a566703b9b954cf93c0186e6959af9c6b84ee7..3e04a18599a7b488fb306cbaf598494bd48b69d5 100644
--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -6,5 +6,7 @@ find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
+add_subdirectory(01_gemm)
add_subdirectory(02_gemm_add_add_fastgelu)
add_subdirectory(03_gemm_layernorm)
+add_subdirectory(04_contraction)
diff --git a/client_example/README.md b/client_example/README.md
index dc6b9c48fca6305a78a0767ec426285d027f963d..64a7130d537b1e2fb8752c4031e8430d11a6a46a 100644
--- a/client_example/README.md
+++ b/client_example/README.md
@@ -1,17 +1,6 @@
##
Client applications link to the CK library, so the CK library needs to be installed before building client applications.
-## Docker script
-```bash
-docker run \
--it \
---privileged \
---group-add sudo \
--w /root/workspace \
--v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
-rocm/tensorflow:rocm5.1-tf2.6-dev \
-/bin/bash
-```
## Build
```bash
@@ -22,7 +11,7 @@ cd client_example/build
```bash
cmake \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
--D CMAKE_PREFIX_PATH=/opt/rocm \
+-D CMAKE_PREFIX_PATH="/opt/rocm;${PATH_TO_CK_INSTALL_DIRECTORY}" \
..
```
diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake
index 3718b916ffe43996852507881db281dc5647fef0..cf2240ebc52f1707cfc7a6cc2e6f9414aa9cb8e0 100644
--- a/cmake/googletest.cmake
+++ b/cmake/googletest.cmake
@@ -20,6 +20,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-unused-member-function
-Wno-comma
-Wno-old-style-cast
+ -Wno-deprecated
)
message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")
diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt
index c03c454c68eafe8aa6a9040648f3ac7aa846f3b9..fc22088ad4f61d4e5cc4bb8ae880688578bc46bf 100644
--- a/example/01_gemm/CMakeLists.txt
+++ b/example/01_gemm/CMakeLists.txt
@@ -4,5 +4,6 @@ add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
+add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp)
# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
diff --git a/example/01_gemm/gemm_dl_fp16.cpp b/example/01_gemm/gemm_dl_fp16.cpp
index 0a3060fdc71b22cd655634c7b5d01b00363dffee..e4bd3906c27ceddced35be201b9bfc0b45761a12 100644
--- a/example/01_gemm/gemm_dl_fp16.cpp
+++ b/example/01_gemm/gemm_dl_fp16.cpp
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 template <ck::index_t... Is>
@@ -142,9 +142,9 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
}
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
diff --git a/example/01_gemm/gemm_dl_fp32.cpp b/example/01_gemm/gemm_dl_fp32.cpp
index d9677da9b9fd6aa2578cb20b3176e5c5d45b0ffd..0b5d5b6de108920e61c34a41a3dd5f3d6ad89fd6 100644
--- a/example/01_gemm/gemm_dl_fp32.cpp
+++ b/example/01_gemm/gemm_dl_fp32.cpp
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 template <ck::index_t... Is>
@@ -141,9 +141,9 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
}
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
diff --git a/example/01_gemm/gemm_dl_int8.cpp b/example/01_gemm/gemm_dl_int8.cpp
index 65206d602f66eb800c783bace5a784fadee0c86a..77871105801c3b7e4af8bb472dd6addbf7ee7776 100644
--- a/example/01_gemm/gemm_dl_int8.cpp
+++ b/example/01_gemm/gemm_dl_int8.cpp
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 template <ck::index_t... Is>
@@ -139,9 +139,9 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
}
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp
index 0575c0bd9e2fa89a5f8823d7a7796d3d75a50ffd..f1a2448025b1210786a2925605350d406d5ed50a 100644
--- a/example/01_gemm/gemm_xdl_bf16.cpp
+++ b/example/01_gemm/gemm_xdl_bf16.cpp
@@ -11,9 +11,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
@@ -170,9 +170,9 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
}
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp
index 1a603ff995e8822326d132abf8cf5d99a4930036..052d28994fa68158fa6936614980266d686926c2 100644
--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -9,13 +9,15 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_waveletmodel_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
@@ -47,22 +49,29 @@ using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
-using DeviceGemmInstance0 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
-//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-//######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- < Row, Col, Row, F16, F16, F16, F32, F32, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
-
-using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_WaveletModel_CShuffle
+using DeviceGemmInstance0 = ck::tensor_operation::device::DeviceGemmXdl
+//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
+//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>;
+
+using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
+//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+
+using DeviceGemmInstance2 = ck::tensor_operation::device::DeviceGemm_Xdl_WaveletModel_CShuffle
//######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| ABBlockTransfer| BlockGemm| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| ThreadGroupSize| ThreadGroupSize| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- < Row, Col, Row, F16, F16, F16, F32, F16, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1,8>, 8>;
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ < Row, Col, Row, F16, F16, F16, F32, F16, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1,8>, 8>;
// clang-format on
-using DeviceGemmInstance = DeviceGemmInstance1;
+using DeviceGemmInstance = DeviceGemmInstance2;
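+// DeviceGemmInstance2 selects the wavelet-model pipeline; switch to instance 0 or 1
+// for the plain xdlops or cshuffle paths.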
using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
@@ -115,6 +124,14 @@ int main(int argc, char* argv[])
exit(0);
}
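+ // A negative stride argument selects the packed (contiguous) stride for that tensor's layout.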
+ const ck::index_t PackedStrideA = ck::is_same_v<ALayout, Row> ? K : M;
+ const ck::index_t PackedStrideB = ck::is_same_v<BLayout, Row> ? N : K;
+ const ck::index_t PackedStrideC = ck::is_same_v<CLayout, Row> ? N : M;
+
+ StrideA = (StrideA < 0) ? PackedStrideA : StrideA;
+ StrideB = (StrideB < 0) ? PackedStrideB : StrideB;
+ StrideC = (StrideC < 0) ? PackedStrideC : StrideC;
+
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
@@ -154,9 +171,9 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
}
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
@@ -166,8 +183,9 @@ int main(int argc, char* argv[])
auto c_element_op = CElementOp{};
// do GEMM
- // replace DeviceGemmInstance_WaveletModel for wavelet gemm pipeline
- // auto gemm = DeviceGemmInstance_WaveletModel{};
+ // replace DeviceGemmInstance_WaveletModel for
+ // wavelet gemm pipeline auto gemm =
+ // DeviceGemmInstance_WaveletModel{};
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 1b222c971267102dbd3cbd7465aaf82009d6ecd9..82e2f99b983b3c7e8794fe1e4550ddea09461a8c 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
@@ -165,9 +165,9 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
}
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp
index 4ed1f177db6d0e5df668256f232d631ca9f2464a..ca5c66f8af1db0cd37650aaf42d13fbedff677ae 100644
--- a/example/01_gemm/gemm_xdl_int8.cpp
+++ b/example/01_gemm/gemm_xdl_int8.cpp
@@ -13,9 +13,9 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
@@ -167,9 +167,9 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
}
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
diff --git a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c709d30cfd531b21be3c02b44e57b0d7be386e33
--- /dev/null
+++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
@@ -0,0 +1,260 @@
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_xdl_skip_b_lds.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+using BElementOp = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+#define USING_SKIP_LDS 1
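+// With USING_SKIP_LDS=1, B bypasses LDS and is streamed from global memory straight
+// into registers; set it to 0 to fall back to the regular DeviceGemmXdl pipeline.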
+
+// clang-format off
+#if USING_SKIP_LDS
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSkipBLds
+ //###########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BThreadTransfer| BBlock| CThreadTransfer| CThreadTransfer|
+ //###########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| SrcScalar| buffer| SrcDstVectorDim| DstScalar|
+ //###########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| size | | PerVector|
+ //###########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+#if 0
+ < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 64, 4, 8, 16, 16, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 8, 8, 7, 1>;
+using ADataType = ck::half_t;
+using BDataType = ck::half_t;
+using CDataType = ck::half_t;
+using AccDataType = float;
+#else
+ < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 4, 4, 7, 1>;
+using ADataType = float;
+using BDataType = float;
+using CDataType = float;
+using AccDataType = float;
+#endif
+
+#else
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
+ //###########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+ //###########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
+ //###########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
+ //###########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 64, 4, 4, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1, 2>;
+using ADataType = float;
+using BDataType = float;
+using CDataType = float;
+using AccDataType = float;
+
+#endif
+ // clang-format on
+
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
+
+template <typename T>
+std::ostream& show_2d_matrix(std::ostream& os, Tensor<T>& matrix)
+{
+ os << "[" << std::endl;
+ for(size_t x = 0; x < matrix.mDesc.GetLengths()[0]; x++)
+ {
+ os << "[";
+ for(size_t y = 0; y < matrix.mDesc.GetLengths()[1]; y++)
+ {
os << std::setw(5) << static_cast<float>(matrix(x, y));
+ }
+ os << "]" << std::endl;
+ }
+ os << "]";
+ return os;
+}
+int main(int argc, char* argv[])
+{
+ bool do_verification = 0;
+ int init_method = 0;
+ bool time_kernel = false;
+
+ // GEMM shape
+#if 1
+ ck::index_t M = 16;
+ ck::index_t N = 64 * 120;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = K;
+ ck::index_t StrideB = K;
+ ck::index_t StrideC = N;
+#else
+ ck::index_t M = 16;
+ ck::index_t N = 16;
+ ck::index_t K = 32;
+
+ ck::index_t StrideA = 8;
+ ck::index_t StrideB = 8;
+ ck::index_t StrideC = 16;
+#endif
+
+ if(argc == 4)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+ }
+ else if(argc == 10)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+
+ M = std::stoi(argv[4]);
+ N = std::stoi(argv[5]);
+ K = std::stoi(argv[6]);
+
+ StrideA = std::stoi(argv[7]);
+ StrideB = std::stoi(argv[8]);
+ StrideC = std::stoi(argv[9]);
+ }
+ else
+ {
+ printf("arg1: verification (0=no, 1=yes)\n");
+ printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+ printf("arg3: time kernel (0=n0, 1=yes)\n");
+ printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
+ exit(0);
+ }
+
+ auto f_host_tensor_descriptor =
+ [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+ if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+ {
+ return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+ std::vector<std::size_t>({stride, 1}));
+ }
+ else
+ {
+ return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+ std::vector<std::size_t>({1, stride}));
+ }
+ };
+
+ Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+ Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+ Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+ Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+ std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+ std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+ std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+ switch(init_method)
+ {
+ case 0: break;
+ case 1:
+ a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+ break;
+ case 2:
+ a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+ b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+ break;
+ default:
+ // a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+ a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+ }
+
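+ // GetElementSpaceSize() covers the full addressable span, including any stride
+ // padding, so non-packed tensors still get a sufficient device allocation.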
+ DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+
+ a_m_k_device_buf.ToDevice(a_m_k.mData.data());
+ b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+
+ auto a_element_op = AElementOp{};
+ auto b_element_op = BElementOp{};
+ auto c_element_op = CElementOp{};
+
+ // do GEMM
+ auto gemm = DeviceGemmInstance{};
+ auto invoker = gemm.MakeInvoker();
+ auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+ static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+ static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ if(!gemm.IsSupportedArgument(argument))
+ {
+ throw std::runtime_error(
+ "wrong! device_gemm with the specified compilation parameters does "
+ "not support this GEMM problem");
+ }
+
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
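+ // ave_time is in ms: flop / 1e9 / ms yields TFLOP/s and bytes / 1e6 / ms yields GB/s.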
+ float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+ << gemm.GetTypeString() << std::endl;
+
+ c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+ if(do_verification)
+ {
+ auto ref_gemm = ReferenceGemmInstance{};
+ auto ref_invoker = ref_gemm.MakeInvoker();
+
+ auto ref_argument = ref_gemm.MakeArgument(
+ a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
+
+ ref_invoker.Run(ref_argument);
+
+#if 0
+ {
+ show_2d_matrix(std::cout << "a : ", a_m_k) << std::endl;
+ show_2d_matrix(std::cout << "b: ", b_k_n) << std::endl;
+ show_2d_matrix(std::cout << "c_device: ", c_m_n_device_result) << std::endl;
+ show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl;
+ }
+#endif
+ ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+ }
+
+ return 0;
+}
diff --git a/example/02_gemm_alpha_beta/CMakeLists.txt b/example/02_gemm_alpha_beta/CMakeLists.txt
deleted file mode 100644
index 1b81cf21622b6e70cb43dbd4bc90874fc7bf5580..0000000000000000000000000000000000000000
--- a/example/02_gemm_alpha_beta/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_example_executable(example_gemm_xdl_alpha_beta gemm_xdl_alpha_beta.cpp)
diff --git a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp b/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
deleted file mode 100644
index ac56323f722f8dbd9a2d5772c4abe21212830820..0000000000000000000000000000000000000000
--- a/example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_xdl_c_shuffle_bias_2d.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp"
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using ADataType = ck::half_t;
-using BDataType = ck::half_t;
-using CDataType = ck::half_t;
-using AccDataType = float;
-
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
-
-using AElementOp = ck::tensor_operation::element_wise::PassThrough;
-using BElementOp = ck::tensor_operation::element_wise::PassThrough;
-using CElementOp = ck::tensor_operation::element_wise::AlphaBetaAdd;
-
-// clang-format off
-using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle_Bias_2d<
- ADataType, // ADataType
- BDataType, // BDataType
- CDataType, // CDataType
- AccDataType, // AccDataType
- ALayout, // ALayout
- BLayout, // BLayout
- CLayout, // CLayout
- AElementOp, // AElementwiseOperation
- BElementOp, // BElementwiseOperation
- CElementOp, // CElementwiseOperation
- 256, // BlockSize
- 256, // MPerBlock
- 128, // NPerBlock
- 4, // K0PerBlock
- 8, // K1
- 32, // MPerXDL
- 32, // NPerXDL
- 4, // MXdlPerWave
- 2, // NXdlPerWave
- S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
- S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
- S<1, 0, 2>, // ABlockTransferSrcAccessOrder
- 2, // ABlockTransferSrcVectorDim
- 8, // ABlockTransferSrcScalarPerVector
- 8, // ABlockTransferDstScalarPerVector_K1
- true, // ABlockLdsAddExtraM
- S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
- S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
- S<1, 0, 2>, // BBlockTransferSrcAccessOrder
- 2, // BBlockTransferSrcVectorDim
- 8, // BBlockTransferSrcScalarPerVector
- 8, // BBlockTransferDstScalarPerVector_K1
- true, // BBlockLdsAddExtraN
- 1, // CShuffleMXdlPerWavePerShuffle
- 1, // CShuffleNXdlPerWavePerShuffle
- S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
- 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
-// clang-format on
-
-using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
-
-int main(int argc, char* argv[])
-{
- bool do_verification = true;
- int init_method = 1;
- bool time_kernel = false;
-
- // GEMM shape
- ck::index_t M = 3840;
- ck::index_t N = 4096;
- ck::index_t K = 4096;
-
- ck::index_t StrideA = 4096;
- ck::index_t StrideB = 4096;
- ck::index_t StrideC = 4096;
-
- float alpha = 1.0f;
- float beta = 1.0f;
-
- if(argc == 4)
- {
- do_verification = std::stoi(argv[1]);
- init_method = std::stoi(argv[2]);
- time_kernel = std::stoi(argv[3]);
- }
- else if(argc == 6)
- {
- do_verification = std::stoi(argv[1]);
- init_method = std::stoi(argv[2]);
- time_kernel = std::stoi(argv[3]);
-
- alpha = std::stof(argv[4]);
- beta = std::stof(argv[5]);
- }
- else if(argc == 12)
- {
- do_verification = std::stoi(argv[1]);
- init_method = std::stoi(argv[2]);
- time_kernel = std::stoi(argv[3]);
-
- M = std::stoi(argv[4]);
- N = std::stoi(argv[5]);
- K = std::stoi(argv[6]);
-
- StrideA = std::stoi(argv[7]);
- StrideB = std::stoi(argv[8]);
- StrideC = std::stoi(argv[9]);
-
- alpha = std::stof(argv[10]);
- beta = std::stof(argv[11]);
- }
- else
- {
- printf("arg1: verification (0=no, 1=yes)\n");
- printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: time kernel (0=n0, 1=yes)\n");
- printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n");
- exit(0);
- }
-
- auto f_host_tensor_descriptor =
- [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
- if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
- {
- return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
- std::vector<std::size_t>({stride, 1}));
- }
- else
- {
- return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
- std::vector<std::size_t>({1, stride}));
- }
- };
-
- Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
- Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
- Tensor<CDataType> c0_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
- Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
- Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-
- std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
- std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
- std::cout << "c0_m_n: " << c0_m_n.mDesc << std::endl;
- std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-
- switch(init_method)
- {
- case 0: break;
- case 1:
- a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
- b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
- c0_m_n.GenerateTensorValue(GeneratorTensor_2<CDataType>{-5, 5});
- break;
- default:
- a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
- b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
- c0_m_n.GenerateTensorValue(GeneratorTensor_3<CDataType>{-0.5, 0.5});
- }
-
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem c0_m_n_device_buf(sizeof(CDataType) * c0_m_n.mDesc.GetElementSpace());
- DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
-
- a_m_k_device_buf.ToDevice(a_m_k.mData.data());
- b_k_n_device_buf.ToDevice(b_k_n.mData.data());
- c0_m_n_device_buf.ToDevice(c0_m_n.mData.data());
- c_m_n_device_buf.ToDevice(c_m_n_device_result.mData.data());
-
- // do GEMM
- auto gemm = DeviceGemmInstance{};
- auto invoker = gemm.MakeInvoker();
- auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
- static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
- static_cast<CDataType*>(c0_m_n_device_buf.GetDeviceBuffer()),
- static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
- M,
- N,
- K,
- StrideA,
- StrideB,
- StrideC,
- AElementOp{},
- BElementOp{},
- CElementOp{alpha, beta});
-
- if(!gemm.IsSupportedArgument(argument))
- {
- throw std::runtime_error(
- "wrong! device_gemm with the specified compilation parameters does "
- "not support this GEMM problem");
- }
-
- float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-
- std::size_t flop = std::size_t(2) * M * N * K;
- std::size_t num_btype =
- sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
-
- float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
- float gb_per_sec = num_btype / 1.E6 / ave_time;
-
- std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
- << std::endl;
-
- c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-
- if(do_verification)
- {
- auto ref_gemm = ReferenceGemmInstance{};
- auto ref_invoker = ref_gemm.MakeInvoker();
-
- auto ref_argument = ref_gemm.MakeArgument(a_m_k,
- b_k_n,
- c0_m_n,
- c_m_n_host_result,
- AElementOp{},
- BElementOp{},
- CElementOp{alpha, beta});
-
- ref_invoker.Run(ref_argument);
-
- return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
- }
-
- return 0;
-}
diff --git a/example/02_gemm_bilinear/CMakeLists.txt b/example/02_gemm_bilinear/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..10ec0f1a71151668e262efcdbaff7100d2d08dfa
--- /dev/null
+++ b/example/02_gemm_bilinear/CMakeLists.txt
@@ -0,0 +1 @@
+add_example_executable(example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp)
diff --git a/example/02_gemm_alpha_beta/README.md b/example/02_gemm_bilinear/README.md
similarity index 69%
rename from example/02_gemm_alpha_beta/README.md
rename to example/02_gemm_bilinear/README.md
index ba2a3068f3e78757d34f3e9d7f382a76aef19bc5..9eb87e1e3479d72497ec72956b1de649b0ff735f 100644
--- a/example/02_gemm_alpha_beta/README.md
+++ b/example/02_gemm_bilinear/README.md
@@ -1,11 +1,13 @@
-# Instructions for ```example_gemm_xdl_alpha_beta```
+# Instructions for ```example_gemm_bilinear_xdl_fp16```
-## Run ```example_gemm_xdl_alpha_beta```
+## Run ```example_gemm_bilinear_xdl_fp16```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-./bin/example_gemm_xdl_alpha_beta 1 1 1 0.5 0.5
+#arg3: time kernel (0=no, 1=yes)
+#arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE
+#arg11 to 12: alpha, beta
+./bin/example_gemm_bilinear_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 4096 0.5 0.5
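+# running with only the first three flags (or none at all) keeps the default 3840x4096x4096 problem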
```
Result (MI100 @ 1502Mhz, 184.6TFlops peak FP16)
```
diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..081f2b5142d759251241c021cd008c2104280dda
--- /dev/null
+++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <string>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+struct AlphaBetaAdd
+{
+ AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+ template <typename E, typename C, typename D>
+ __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;
+
+ template <>
+ __host__ __device__ constexpr void operator()<ck::half_t, float, ck::half_t>(
+ ck::half_t& e, const float& c, const ck::half_t& d) const
+ {
+ e = ck::type_convert<ck::half_t>(alpha_ * c + beta_ * ck::type_convert<float>(d));
+ };
+
+ float alpha_;
+ float beta_;
+};
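+// AlphaBetaAdd is the fused epilogue: e = alpha * c + beta * d, accumulated in fp32
+// and converted back to fp16 on the store.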
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType = F16;
+using BDataType = F16;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using DDataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using DLayout = Row;
+using ELayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AlphaBetaAdd;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance =
+ ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
+ BLayout,
+ ck::Tuple<DLayout>,
+ ELayout,
+ ADataType,
+ BDataType,
+ AccDataType,
+ CShuffleDataType,
+ ck::Tuple<DDataType>,
+ EDataType,
+ AElementOp,
+ BElementOp,
+ CDEElementOp,
+ GemmSpec,
+ 1,
+ 256,
+ 256,
+ 128,
+ 32,
+ 8,
+ 8,
+ 32,
+ 32,
+ 4,
+ 2,
+ S<4, 64, 1>,
+ S<1, 0, 2>,
+ S<1, 0, 2>,
+ 2,
+ 8,
+ 8,
+ 1,
+ S<4, 64, 1>,
+ S<1, 0, 2>,
+ S<1, 0, 2>,
+ 2,
+ 8,
+ 8,
+ 1,
+ 1,
+ 1,
+ S<1, 32, 1, 8>,
+ 8>;
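+// 256-thread blocks on a 256x128x32 tile with 4x2 xdlops waves; MNKPadding lets the
+// kernel accept arbitrary problem sizes at the cost of some boundary handling.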
+
+int main(int argc, char* argv[])
+{
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
+
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideD = 4096;
+ ck::index_t StrideE = 4096;
+
+ float alpha = 1.0f;
+ float beta = 1.0f;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 4)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+ }
+ else if(argc == 6)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+
+ alpha = std::stof(argv[4]);
+ beta = std::stof(argv[5]);
+ }
+ else if(argc == 13)
+ {
+ do_verification = std::stoi(argv[1]);
+ init_method = std::stoi(argv[2]);
+ time_kernel = std::stoi(argv[3]);
+
+ M = std::stoi(argv[4]);
+ N = std::stoi(argv[5]);
+ K = std::stoi(argv[6]);
+
+ StrideA = std::stoi(argv[7]);
+ StrideB = std::stoi(argv[8]);
+ StrideD = std::stoi(argv[9]);
+ StrideE = std::stoi(argv[10]);
+
+ alpha = std::stof(argv[11]);
+ beta = std::stof(argv[12]);
+ }
+ else
+ {
+ printf("arg1: verification (0=no, 1=yes)\n");
+ printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+ printf("arg3: time kernel (0=no, 1=yes)\n");
+ printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
+ "beta\n");
+ exit(0);
+ }
+
+ auto f_host_tensor_descriptor =
+ [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+ if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+ {
+ return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+ std::vector<std::size_t>({stride, 1}));
+ }
+ else
+ {
+ return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+ std::vector<std::size_t>({1, stride}));
+ }
+ };
+
+ Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+ Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+ Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
+ Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+ Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+ std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+ std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+ std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+ std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+ switch(init_method)
+ {
+ case 0: break;
+ case 1:
+ a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+ d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+ break;
+ default:
+ a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+ b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+ d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
+ }
+
+ DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+ DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+ a_device_buf.ToDevice(a_m_k.mData.data());
+ b_device_buf.ToDevice(b_k_n.mData.data());
+ d_device_buf.ToDevice(d_m_n.mData.data());
+ e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+ auto a_element_op = AElementOp{};
+ auto b_element_op = BElementOp{};
+ auto cde_element_op = CDEElementOp{alpha, beta};
+
+ // do GEMM
+ auto device_op = DeviceOpInstance{};
+ auto invoker = device_op.MakeInvoker();
+ auto argument =
+ device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ std::array<ck::index_t, 1>{StrideD},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ if(!device_op.IsSupportedArgument(argument))
+ {
+ throw std::runtime_error(
+ "wrong! device_gemm with the specified compilation parameters does "
+ "not support this GEMM problem");
+ }
+
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+ float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+ << std::endl;
+
+ e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+ if(do_verification)
+ {
+ Tensor<CShuffleDataType> c_m_n(HostTensorDescriptor(std::vector<std::size_t>{
+ static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
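+ // c_m_n holds the raw fp32 GEMM result; the bilinear epilogue is replayed on the host below.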
+
+ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+ BDataType, CShuffleDataType, AccDataType, AElementOp, BElementOp, PassThrough>;
+ auto ref_gemm = ReferenceGemmInstance{};
+ auto ref_invoker = ref_gemm.MakeInvoker();
+
+ auto ref_argument =
+ ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+
+ ref_invoker.Run(ref_argument);
+
+ for(int m = 0; m < M; ++m)
+ {
+ for(int n = 0; n < N; ++n)
+ {
+ cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
+ }
+ }
+
+ e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+ return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
+ }
+
+ return 0;
+}
diff --git a/example/03_gemm_bias_relu/CMakeLists.txt b/example/03_gemm_bias_relu/CMakeLists.txt
index d07ad6e36c3a9f1deda141a66e20945c7fff37c1..35c54abac03094f24187df2503aa02b6812c20f3 100644
--- a/example/03_gemm_bias_relu/CMakeLists.txt
+++ b/example/03_gemm_bias_relu/CMakeLists.txt
@@ -1 +1 @@
-add_example_executable(example_gemm_xdl_bias_relu gemm_xdl_bias_relu.cpp)
+add_example_executable(example_gemm_bias_relu_xdl_fp16 gemm_bias_relu_xdl_fp16.cpp)
diff --git a/example/03_gemm_bias_relu/README.md b/example/03_gemm_bias_relu/README.md
index f8d9bd6152907de226567aefc85b91de00238e05..f28a9a071c879e92be34f84054661647c31ebb35 100644
--- a/example/03_gemm_bias_relu/README.md
+++ b/example/03_gemm_bias_relu/README.md
@@ -1,28 +1,10 @@
-# Instructions for ```example_gemm_xdl_bias_relu_add```
+# Instructions for ```example_gemm_bias_relu_xdl_fp16```
-## Run ```example_gemm_xdl_bias_relu_add```
+## Run ```example_gemm_bias_relu_xdl_fp16```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-#arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
-./bin/example_gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096
-```
-
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-c0_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-c1_m_n: dim 2, lengths {3840, 4096}, strides {1, 0}
-arg.a_grid_desc_k0_m_k1_{512, 3840, 8}
-arg.b_grid_desc_k0_n_k1_{512, 4096, 8}
-arg.c_grid_desc_m_n_{ 3840, 4096}
-arg.c0_grid_desc_m_n_{ 3840, 4096}
-arg.c1_grid_desc_m_n_{ 3840, 4096}
-launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.27583 ms, 100.992 TFlops, 73.9688 GB/s
+#arg3: time kernel (0=no, 1=yes)
+#arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE
+./bin/example_gemm_bias_relu_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096
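+# (running with only the first three flags keeps the default problem size)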
```
diff --git a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
similarity index 89%
rename from example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
rename to example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
index 25eadc5fd0251908ba2f77771ccbd39ced11a932..ae5e323410fd2a831cec5a222b32ec1a5d5795e2 100644
--- a/example/03_gemm_bias_relu/gemm_xdl_bias_relu.cpp
+++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
@@ -12,9 +12,9 @@
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
@@ -47,33 +47,34 @@ using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F16;
using DDataType = F16;
-using DsDataType = ck::Tuple<DDataType>;
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
+using DLayout = Row;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddRelu;
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using DeviceOpInstance =
ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<ALayout,
BLayout,
+ ck::Tuple<DLayout>,
ELayout,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
- DsDataType,
+ ck::Tuple<DDataType>,
EDataType,
AElementOp,
BElementOp,
CDEElementOp,
- GemmDefault,
+ GemmSpec,
1,
256,
256,
@@ -191,14 +192,14 @@ int main(int argc, char* argv[])
d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
}
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem d_m_n_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpace());
- DeviceMem e_m_n_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
+ DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+ DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
- a_m_k_device_buf.ToDevice(a_m_k.mData.data());
- b_k_n_device_buf.ToDevice(b_k_n.mData.data());
- d_m_n_device_buf.ToDevice(d_m_n.mData.data());
+ a_device_buf.ToDevice(a_m_k.mData.data());
+ b_device_buf.ToDevice(b_k_n.mData.data());
+ d_device_buf.ToDevice(d_m_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
@@ -210,10 +211,10 @@ int main(int argc, char* argv[])
auto invoker = device_op.MakeInvoker();
auto argument =
- device_op.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
- b_k_n_device_buf.GetDeviceBuffer(),
- std::array<const void*, 1>{d_m_n_device_buf.GetDeviceBuffer()},
- e_m_n_device_buf.GetDeviceBuffer(),
+ device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
M,
N,
K,
@@ -246,7 +247,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
- e_m_n_device_buf.FromDevice(e_m_n_device_result.mData.data());
+ e_device_buf.FromDevice(e_m_n_device_result.mData.data());
Tensor c_m_n(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
diff --git a/example/04_gemm_add_add_fastgelu/CMakeLists.txt b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
index 754de47c2b4556c62d7a04714224c29f23d0813f..0285a53f284193448b3eb7df1ac5609d0a7b6e9c 100644
--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
@@ -1 +1,4 @@
+add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
+add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
+add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2f7a4fd8621ec30f794292a89bc45f35a27144bc
--- /dev/null
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <type_traits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+
+using ADataType = BF16;
+using BDataType = BF16;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using D0DataType = BF16;
+using D1DataType = BF16;
+using DsDataType = ck::Tuple<D0DataType, D1DataType>;
+using EDataType = BF16;
+
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
+//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+// clang-format on
+
+#include "run_gemm_add_add_fastgelu_example.inc"
+
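+// the shared driver returns true on success, hence the negation for a conventional 0 exit code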
+int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
index d907ab6b2495fe19a0ad9d2f37bea2f21c94e1b4..149cef6f81571b91936956ea16e919949675d189 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -1,10 +1,10 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
+#include <numeric>
+#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -12,11 +12,12 @@
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -43,6 +44,7 @@ using ALayout = Row;
using BLayout = Col;
using D0Layout = Row;
using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
using ELayout = Row;
using AElementOp = PassThrough;
@@ -53,196 +55,13 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
-//######| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-//######| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-//######| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- < ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
+//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
-int main(int argc, char* argv[])
-{
- bool do_verification = true;
- int init_method = 1;
- bool time_kernel = false;
+#include "run_gemm_add_add_fastgelu_example.inc"
- // GEMM shape
- ck::index_t M = 3840;
- ck::index_t N = 4096;
- ck::index_t K = 4096;
-
- ck::index_t StrideA = 4096;
- ck::index_t StrideB = 4096;
- ck::index_t StrideD0 = 0;
- ck::index_t StrideD1 = 4096;
- ck::index_t StrideE = 4096;
-
- if(argc == 1)
- {
- // use default case
- }
- else if(argc == 4)
- {
- do_verification = std::stoi(argv[1]);
- init_method = std::stoi(argv[2]);
- time_kernel = std::stoi(argv[3]);
- }
- else if(argc == 12)
- {
- do_verification = std::stoi(argv[1]);
- init_method = std::stoi(argv[2]);
- time_kernel = std::stoi(argv[3]);
-
- M = std::stoi(argv[4]);
- N = std::stoi(argv[5]);
- K = std::stoi(argv[6]);
-
- StrideA = std::stoi(argv[7]);
- StrideB = std::stoi(argv[8]);
- StrideD0 = std::stoi(argv[9]);
- StrideD1 = std::stoi(argv[10]);
- StrideE = std::stoi(argv[11]);
- }
- else
- {
- printf("arg1: verification (0=no, 1=yes)\n");
- printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
- printf("arg3: time kernel (0=no, 1=yes)\n");
- printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
- "StrideE\n");
- exit(0);
- }
-
- auto f_host_tensor_descriptor =
- [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
- if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
- {
- return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
- std::vector<std::size_t>({stride, 1}));
- }
- else
- {
- return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
- std::vector<std::size_t>({1, stride}));
- }
- };
-
- Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
- Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
- Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
- Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
- Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
- Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
-
- std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
- std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
- std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
- std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
- std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
-
- switch(init_method)
- {
- case 0: break;
- case 1:
- a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
- b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
- d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
- d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-5, 5});
- break;
- default:
- a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
- b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
- d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
- d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
- }
-
- DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
- DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
- DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpace());
- DeviceMem d1_m_n_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpace());
- DeviceMem e_m_n_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace());
-
- a_m_k_device_buf.ToDevice(a_m_k.mData.data());
- b_k_n_device_buf.ToDevice(b_k_n.mData.data());
- d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
- d1_m_n_device_buf.ToDevice(d1_m_n.mData.data());
-
- auto a_element_op = AElementOp{};
- auto b_element_op = BElementOp{};
- auto cde_element_op = CDEElementOp{};
-
- // do GEMM
- auto device_op = DeviceOpInstance{};
- auto invoker = device_op.MakeInvoker();
- auto argument =
- device_op.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
- b_k_n_device_buf.GetDeviceBuffer(),
- std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
- d1_m_n_device_buf.GetDeviceBuffer()},
- e_m_n_device_buf.GetDeviceBuffer(),
- M,
- N,
- K,
- StrideA,
- StrideB,
- std::array<ck::index_t, 2>{StrideD0, StrideD1},
- StrideE,
- a_element_op,
- b_element_op,
- cde_element_op);
-
- if(!device_op.IsSupportedArgument(argument))
- {
- throw std::runtime_error("wrong! this device_op instance does not support this problem");
- }
-
- float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-
- std::size_t flop = std::size_t(2) * M * N * K;
- std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
- sizeof(D0DataType) * N + sizeof(D1DataType) * M * N +
- sizeof(EDataType) * M * N;
-
- float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
- float gb_per_sec = num_btype / 1.E6 / ave_time;
-
- std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
- << device_op.GetTypeString() << std::endl;
-
- if(do_verification)
- {
- Tensor<CShuffleDataType> c_m_n(HostTensorDescriptor(
- std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
-
- using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, BDataType, CShuffleDataType, AccDataType, AElementOp, BElementOp, PassThrough>;
-
- auto ref_gemm = ReferenceGemmInstance{};
- auto ref_invoker = ref_gemm.MakeInvoker();
-
- auto ref_argument =
- ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
-
- ref_invoker.Run(ref_argument);
-
- for(int m = 0; m < M; ++m)
- {
- for(int n = 0; n < N; ++n)
- {
- cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
- }
- }
-
- e_m_n_device_buf.FromDevice(e_m_n_device_result.mData.data());
-
- return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
- }
-
- return 0;
-}
+int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
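Note: run_gemm_add_add_fastgelu_example() returns true on success, so each example's main negates the result to produce the conventional zero exit code; the same one-line main recurs in every per-precision variant of this example below.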
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dfef81fa0ce5d1a5a1331e5b7500e54ad77304d8
--- /dev/null
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+
+using ADataType = F32;
+using BDataType = F32;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using D0DataType = F32;
+using D1DataType = F32;
+using DsDataType = ck::Tuple<D0DataType, D1DataType>;
+using EDataType = F32;
+
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
+//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>;
+// clang-format on
+
+#include "run_gemm_add_add_fastgelu_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c00339f7b81830beabca2718fac7cca6cf1e24a5
--- /dev/null
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using I8 = int8_t;
+using I32 = int32_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+
+using ADataType = I8;
+using BDataType = I8;
+using AccDataType = I32;
+using CShuffleDataType = I32;
+using D0DataType = I8;
+using D1DataType = I8;
+using DsDataType = ck::Tuple<D0DataType, D1DataType>;
+using EDataType = I8;
+
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using DsLayout = ck::Tuple<D0Layout, D1Layout>;
+using ELayout = Row;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// clang-format off
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
+//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
+//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>;
+// clang-format on
+
+#include "run_gemm_add_add_fastgelu_example.inc"
+
+int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
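Across the fp16, fp32, and int8 instantiations only the element types and vector widths change: 8 half, 4 float, or 16 int8 elements per access (compare the SrcScalarPerVector and CBlockTransfer ScalarPerVector values: 8, 4, 16), so every variant issues the same 16-byte vector loads and stores. A minimal compile-time check of that invariant, as a standalone sketch assuming ck::half_t is the usual 2-byte half type:

    #include <cstdint>
    #include "ck/ck.hpp" // ck::half_t

    static_assert(8 * sizeof(ck::half_t) == 16, "fp16 instance: 8 x 2 B per vector");
    static_assert(4 * sizeof(float) == 16, "fp32 instance: 4 x 4 B per vector");
    static_assert(16 * sizeof(std::int8_t) == 16, "int8 instance: 16 x 1 B per vector");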
diff --git a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
new file mode 100644
index 0000000000000000000000000000000000000000..6358a4f106cc29982b76a28ad2d22c1ecccca02c
--- /dev/null
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
@@ -0,0 +1,203 @@
+#pragma once
+
+struct ProblemSize final
+{
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideD0 = 0;
+ ck::index_t StrideD1 = 4096;
+ ck::index_t StrideE = 4096;
+};
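+// Note: the default StrideD0 = 0 makes d0 a broadcast row: every row of the
+// GEMM output adds the same N-element bias (num_btype below accordingly counts
+// only N elements for D0).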
+
+struct ExecutionConfig final
+{
+ bool do_verification = true;
+ int init_method = 1;
+ bool time_kernel = false;
+};
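+// The defaults above cover the zero-argument invocation; argv[1..3] override
+// ExecutionConfig and argv[4..11] override ProblemSize (see
+// run_gemm_add_add_fastgelu_example below).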
+
+bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionConfig& config)
+{
+ using namespace ck::literals;
+
+ auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
+
+ auto f_host_tensor_descriptor =
+ [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+ if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+ {
+ return HostTensorDescriptor({row, col}, {stride, 1_uz});
+ }
+ else
+ {
+ return HostTensorDescriptor({row, col}, {1_uz, stride});
+ }
+ };
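+ // Layout semantics of the descriptor above: a row-major element (i, j) sits
+ // at offset i * stride + j, a column-major one at i + j * stride; e.g. a
+ // 3 x 4 row-major tensor with stride 4 stores element (1, 2) at offset 6.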
+
+ Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+ Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+ Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
+ Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
+ Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+ Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+ std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+ std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+ std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
+ std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
+ std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+ switch(config.init_method)
+ {
+ case 0: break;
+ case 1:
+ a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+ d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
+ d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-5, 5});
+ break;
+ default:
+ a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+ b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+ d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
+ d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
+ }
+
+ DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
+ DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+ DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
+ DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
+ DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+ a_device_buf.ToDevice(a_m_k.mData.data());
+ b_device_buf.ToDevice(b_k_n.mData.data());
+ d0_device_buf.ToDevice(d0_m_n.mData.data());
+ d1_device_buf.ToDevice(d1_m_n.mData.data());
+
+ auto a_element_op = AElementOp{};
+ auto b_element_op = BElementOp{};
+ auto cde_element_op = CDEElementOp{};
+
+ // do GEMM
+ auto device_op = DeviceOpInstance{};
+ auto invoker = device_op.MakeInvoker();
+ auto argument =
+ device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ {StrideD0, StrideD1},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ if(!device_op.IsSupportedArgument(argument))
+ {
+ throw std::runtime_error("wrong! this device_op instance does not support this problem");
+ }
+
+ float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
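+ // With time_kernel set, the invoker warms up and returns the kernel time in
+ // milliseconds averaged over repeated runs; otherwise the kernel runs once
+ // and the reported time is not meaningful.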
+
+ std::size_t flop = 2_uz * M * N * K;
+ std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+ sizeof(D0DataType) * N + sizeof(D1DataType) * M * N +
+ sizeof(EDataType) * M * N;
+
+ float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+ << device_op.GetTypeString() << std::endl;
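+ // Worked example with the defaults: flop = 2 * 3840 * 4096 * 4096 ~= 1.29e11,
+ // so ave_time = 1 ms reports ~128.8 TFlops (flop / 1e9 is GFlop, and GFlop per
+ // millisecond equals TFlop per second).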
+
+ if(config.do_verification)
+ {
+ Tensor<CShuffleDataType> c_m_n(HostTensorDescriptor{M, N});
+
+ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, BDataType, CShuffleDataType, AccDataType, AElementOp, BElementOp, PassThrough>;
+
+ auto ref_gemm = ReferenceGemmInstance{};
+ auto ref_invoker = ref_gemm.MakeInvoker();
+
+ auto ref_argument =
+ ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
+
+ ref_invoker.Run(ref_argument);
+
+ for(int m = 0; m < M; ++m)
+ {
+ for(int n = 0; n < N; ++n)
+ {
+ cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
+ }
+ }
+
+ e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+ return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+ }
+
+ return true;
+}
+
+bool run_gemm_add_add_fastgelu_example(int argc, char* argv[])
+{
+ ProblemSize problem_size;
+ ExecutionConfig config;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 4)
+ {
+ config.do_verification = std::stoi(argv[1]);
+ config.init_method = std::stoi(argv[2]);
+ config.time_kernel = std::stoi(argv[3]);
+ }
+ else if(argc == 12)
+ {
+ config.do_verification = std::stoi(argv[1]);
+ config.init_method = std::stoi(argv[2]);
+ config.time_kernel = std::stoi(argv[3]);
+
+ problem_size.M = std::stoi(argv[4]);
+ problem_size.N = std::stoi(argv[5]);
+ problem_size.K = std::stoi(argv[6]);
+
+ problem_size.StrideA = std::stoi(argv[7]);
+ problem_size.StrideB = std::stoi(argv[8]);
+ problem_size.StrideD0 = std::stoi(argv[9]);
+ problem_size.StrideD1 = std::stoi(argv[10]);
+ problem_size.StrideE = std::stoi(argv[11]);
+ }
+ else
+ {
+ std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
+ << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
+ << std::endl
+ << "arg3: time kernel (0=no, 1=yes)" << std::endl
+ << "arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
+ "StrideE"
+ << std::endl;
+ return true;
+ }
+
+ return run_gemm_add_add_fastgelu(problem_size, config);
+}
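+
+// Example invocations (a sketch; the binary name depends on the example target,
+// e.g. example_gemm_add_add_fastgelu_xdl_fp16):
+//   ./example 1 1 1                                        # verify, integer init, time kernel
+//   ./example 1 2 0 3840 4096 4096 4096 4096 0 4096 4096   # explicit sizes and strides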
diff --git a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
deleted file mode 100644
index 4e1dd1f3e6e98badb5b161e14dc77766e40ce3c7..0000000000000000000000000000000000000000
--- a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_util)
diff --git a/example/06_conv2d_fwd_bias_relu/README.md b/example/06_conv2d_fwd_bias_relu/README.md
deleted file mode 100644
index 4c30563ef019096104027b11c4bbbe422442492d..0000000000000000000000000000000000000000
--- a/example/06_conv2d_fwd_bias_relu/README.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# Instructions for ```example_conv_xdl_bias_relu```
-
-## Run ```example_conv_xdl_bias_relu```
-```bash
-#arg1: verification (0=no, 1=yes)
-#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
-#arg3: run kernel # of times (>1)
-#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./bin/example_conv_xdl_bias_relu 0 1 5
-```
-
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```
-in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-bias_k: dim 1, lengths {256}, strides {1}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 5 times...
-Perf: 1.39009 ms, 105.581 TFlops, 239.981 GB/s
-```
diff --git a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
deleted file mode 100644
index b3c492fd23fba59bb5578fb47278c2e0883aa41a..0000000000000000000000000000000000000000
--- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ /dev/null
@@ -1,313 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/conv_util.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
-#include "ck/library/host_tensor/host_tensor.hpp"
-#include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
-
-namespace {
-
-using InDataType = ck::half_t;
-using WeiDataType = ck::half_t;
-using OutDataType = ck::half_t;
-using AccDataType = float;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using InLayout = ck::tensor_layout::convolution::NHWC;
-using WeiLayout = ck::tensor_layout::convolution::KYXC;
-using OutLayout = ck::tensor_layout::convolution::NHWK;
-
-using InElementOp = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
-
-static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set;
-
-static constexpr auto ConvFwdDefault =
- ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
-
-// clang-format off
-using DeviceConvFwdInstance = ck::tensor_operation::device::
- DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
- InDataType, // InDataType
- WeiDataType, // WeiDataType
- OutDataType, // OutDataType
- AccDataType, // AccDataType
- InElementOp, // InElementwiseOperation
- WeiElementOp, // WeiElementwiseOperation
- OutElementOp, // OutElementwiseOperation
- MemorySet, // OutGlobalMemoryDataOperation
- ConvFwdDefault, // ConvForwardSpecialization
- 256, // BlockSize
- 128, // MPerBlock
- 256, // NPerBlock
- 4, // K0PerBlock
- 8, // K1
- 32, // MPerXdl
- 32, // NPerXdl
- 2, // MXdlPerWave
- 4, // NXdlPerWave
- S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
- S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
- S<1, 0, 2>, // ABlockTransferSrcAccessOrder
- 2, // ABlockTransferSrcVectorDim
- 8, // ABlockTransferSrcScalarPerVector
- 8, // ABlockTransferDstScalarPerVector_K1
- true, // ABlockLdsAddExtraM
- S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
- S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
- S<1, 0, 2>, // BBlockTransferSrcAccessOrder
- 2, // BBlockTransferSrcVectorDim
- 8, // BBlockTransferSrcScalarPerVector
- 8, // BBlockTransferDstScalarPerVector_K1
- true, // BBlockLdsAddExtraN
- 1, // CShuffleMXdlPerWavePerShuffle
- 1, // CShuffleNXdlPerWavePerShuffle
- S<1, 1, 32, 1, 1, 8>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
- 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl
-// clang-format on
-
-using ReferenceConvFwdInstance =
- ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation<InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp>;
-
-void PrintUseMsg()
-{
- std::cout << "arg1: verification (0=no, 1=yes)\n"
- << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
- << "arg3: time kernel (0=n0, 1=yes)\n"
- << "Following arguments:\n"
- << " N, K, C, \n"
- << " , (ie Y, X for 2D)\n"
- << "