Commit 000eefbf authored by Chao Liu

Merge remote-tracking branch 'origin/develop' into fused-gemm

parents b64a2860 cac014f1
@@ -18,6 +18,89 @@ def runShell(String command){
return (output != "")
}
def getDockerImageName(){
def img = "${env.MIOPEN_IMAGE_URL}:composable_kernels_${params.COMPILER_VERSION}"
return img
}
def getDockerImage(Map conf=[:]){
env.DOCKER_BUILDKIT=1
def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
def gpu_arch = conf.get("gpu_arch", "gfx908") // prebuilt dockers should have all the architectures enabled so one image can be used for all stages
def no_cache = conf.get("no_cache", false)
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if(env.CCACHE_HOST)
{
def check_host = sh(script:"""(printf "PING\\r\\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
if(check_host == "+PONG")
{
echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
}
else
{
echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
}
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}"""
}
if(no_cache)
{
dockerArgs = dockerArgs + " --no-cache "
}
echo "Docker Args: ${dockerArgs}"
def image = getDockerImageName()
//Check if image exists
def retimage
try
{
echo "Pulling down image: ${image}"
retimage = docker.image("${image}")
retimage.pull()
}
catch(Exception ex)
{
error "Unable to locate image: ${image}"
}
return [retimage, image]
}
def buildDocker(install_prefix){
show_node_info()
env.DOCKER_BUILDKIT=1
checkout scm
def image_name = getDockerImageName()
echo "Building Docker for ${image_name}"
def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if(env.CCACHE_HOST)
{
def check_host = sh(script:"""(printf "PING\\r\\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim()
if(check_host == "+PONG")
{
echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
}
else
{
echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
}
dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CCACHE_HOST}' --build-arg COMPILER_LAUNCHER='ccache' "
env.CCACHE_DIR = """/tmp/ccache_store"""
env.CCACHE_SECONDARY_STORAGE="""redis://${env.CCACHE_HOST}"""
}
echo "Build Args: ${dockerArgs}"
try{
echo "Checking for image: ${image_name}"
sh "docker manifest inspect --insecure ${image_name}"
echo "Image: ${image_name} found!! Skipping building image"
}
catch(Exception ex){
echo "Unable to locate image: ${image_name}. Building image now"
retimage = docker.build("${image_name}", dockerArgs + ' .')
retimage.push()
}
}
def cmake_build(Map conf=[:]){
def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
@@ -100,9 +183,10 @@ def buildHipClangJob(Map conf=[:]){
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
-dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
-def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
+//def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
+def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if (params.COMPILER_VERSION != "release"){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
@@ -113,7 +197,8 @@ def buildHipClangJob(Map conf=[:]){
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try {
-retimage = docker.build("${image}", dockerArgs + '.')
+//retimage = docker.build("${image}", dockerArgs + '.')
+(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
@@ -190,9 +275,9 @@ def runCKProfiler(Map conf=[:]){
// def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
-dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
-def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='${params.COMPILER_VERSION}' "
+def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' "
if (params.COMPILER_VERSION != "release"){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
@@ -202,7 +287,8 @@ def runCKProfiler(Map conf=[:]){
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try {
-retimage = docker.build("${image}", dockerArgs + '.')
+//retimage = docker.build("${image}", dockerArgs + '.')
+(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
timeout(time: 5, unit: 'MINUTES'){
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
@@ -248,7 +334,7 @@ def runCKProfiler(Map conf=[:]){
archiveArtifacts "perf_batched_gemm_${gpu_arch}.log"
archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
archiveArtifacts "perf_conv_fwd_${gpu_arch}.log"
-archiveArtifacts "perf_conv_bwd_${gpu_arch}.log"
+archiveArtifacts "perf_conv_bwd_data_${gpu_arch}.log"
archiveArtifacts "perf_gemm_bilinear_${gpu_arch}.log"
archiveArtifacts "perf_reduction_${gpu_arch}.log"
// stash perf files to master
@@ -258,7 +344,7 @@ def runCKProfiler(Map conf=[:]){
stash name: "perf_batched_gemm_${gpu_arch}.log"
stash name: "perf_grouped_gemm_${gpu_arch}.log"
stash name: "perf_conv_fwd_${gpu_arch}.log"
-stash name: "perf_conv_bwd_${gpu_arch}.log"
+stash name: "perf_conv_bwd_data_${gpu_arch}.log"
stash name: "perf_gemm_bilinear_${gpu_arch}.log"
stash name: "perf_reduction_${gpu_arch}.log"
//we will process results on the master node
@@ -308,16 +394,17 @@ def process_results(Map conf=[:]){
// Jenkins is complaining about the render group
def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
if (conf.get("enforce_xnack_on", false)) {
-dockerOpts = dockerOpts + " --env HSA_XNACK=1"
+dockerOpts = dockerOpts + " --env HSA_XNACK=1 --env GPU_ARCH='${gpu_arch}' "
}
-def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
+def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='release' "
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
try {
-retimage = docker.build("${image}", dockerArgs + '.')
+//retimage = docker.build("${image}", dockerArgs + '.')
+(retimage, image) = getDockerImage(conf)
}
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
echo "The job was cancelled or aborted"
@@ -337,7 +424,7 @@ def process_results(Map conf=[:]){
unstash "perf_batched_gemm_${gpu_arch}.log"
unstash "perf_grouped_gemm_${gpu_arch}.log"
unstash "perf_conv_fwd_${gpu_arch}.log"
-unstash "perf_conv_bwd${gpu_arch}.log"
+unstash "perf_conv_bwd_data_${gpu_arch}.log"
unstash "perf_gemm_bilinear_${gpu_arch}.log"
unstash "perf_reduction_${gpu_arch}.log"
sh "./process_qa_data.sh ${gpu_arch}"
@@ -372,14 +459,22 @@ pipeline {
parallelsAlwaysFailFast()
}
parameters {
booleanParam(
name: "BUILD_DOCKER",
defaultValue: true,
description: "Force building docker image (default: true)")
string(
name: 'COMPILER_VERSION',
defaultValue: 'ck-9110',
-description: 'Specify which version of compiler to use: ck-9110 (default), release, or amd-mainline-open.')
+description: 'Specify which version of compiler to use: ck-9110 (default), release, or amd-stg-open.')
booleanParam(
name: "RUN_FULL_QA",
defaultValue: false,
description: "Select whether to run small set of performance tests (default) or full QA")
booleanParam(
name: "TEST_NODE_PERFORMANCE",
defaultValue: false,
description: "Test the node GPU performance (default: false)")
}
environment{
dbuser = "${dbuser}"
@@ -393,7 +488,24 @@ pipeline {
DOCKER_BUILDKIT = "1"
}
stages{
stage("Build Docker"){
when {
expression { params.BUILD_DOCKER.toBoolean() }
}
parallel{
stage('Docker /opt/rocm'){
agent{ label rocmnode("nogpu") }
steps{
buildDocker('/opt/rocm')
}
}
}
}
stage("Static checks") { stage("Static checks") {
when {
beforeAgent true
expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
}
parallel{ parallel{
// enable after we move from hipcc to hip-clang // enable after we move from hipcc to hip-clang
// stage('Tidy') { // stage('Tidy') {
...@@ -427,6 +539,10 @@ pipeline { ...@@ -427,6 +539,10 @@ pipeline {
} }
stage("Tests") stage("Tests")
{ {
when {
beforeAgent true
expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
}
parallel parallel
{ {
stage("Run Tests: gfx908") stage("Run Tests: gfx908")
...@@ -457,6 +573,10 @@ pipeline { ...@@ -457,6 +573,10 @@ pipeline {
} }
stage("Client App") stage("Client App")
{ {
when {
beforeAgent true
expression { !params.TEST_NODE_PERFORMANCE.toBoolean() }
}
parallel parallel
{ {
stage("Run Client App") stage("Run Client App")
...@@ -480,7 +600,7 @@ pipeline { ...@@ -480,7 +600,7 @@ pipeline {
{ {
when { when {
beforeAgent true beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() } expression { !params.RUN_FULL_QA.toBoolean() && !params.TEST_NODE_PERFORMANCE.toBoolean() }
} }
agent{ label rocmnode("gfx908")} agent{ label rocmnode("gfx908")}
environment{ environment{
...@@ -494,7 +614,7 @@ pipeline { ...@@ -494,7 +614,7 @@ pipeline {
{ {
when { when {
beforeAgent true beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() } expression { params.RUN_FULL_QA.toBoolean() || params.TEST_NODE_PERFORMANCE.toBoolean() }
} }
agent{ label rocmnode("gfx90a")} agent{ label rocmnode("gfx90a")}
environment{ environment{
...@@ -513,7 +633,7 @@ pipeline { ...@@ -513,7 +633,7 @@ pipeline {
stage("Process results for gfx908"){ stage("Process results for gfx908"){
when { when {
beforeAgent true beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() } expression { !params.RUN_FULL_QA.toBoolean() && !params.TEST_NODE_PERFORMANCE.toBoolean() }
} }
agent { label 'mici' } agent { label 'mici' }
steps{ steps{
...@@ -523,7 +643,7 @@ pipeline { ...@@ -523,7 +643,7 @@ pipeline {
stage("Process results for gfx90a"){ stage("Process results for gfx90a"){
when { when {
beforeAgent true beforeAgent true
expression { params.RUN_FULL_QA.toBoolean() } expression { params.RUN_FULL_QA.toBoolean() || params.TEST_NODE_PERFORMANCE.toBoolean() }
} }
agent { label 'mici' } agent { label 'mici' }
steps{ steps{
...
add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp)
add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp)
add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp)
add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
using ADataType = BF16;
using BDataType = BF16;
using AccDataType = F32;
using CShuffleDataType = F32;
using D0DataType = BF16;
using D1DataType = BF16;
using DsDataType = ck::Tuple<D0DataType, D1DataType>;
using EDataType = BF16;
using ALayout = Row;
using BLayout = Col;
using D0Layout = Row;
using D1Layout = Row;
using DsLayout = ck::Tuple<D0Layout, D1Layout>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddAddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
#include "run_gemm_add_add_fastgelu_example.inc"
int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstddef>
#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
+#include <stdexcept>
+#include <string>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -12,11 +12,12 @@
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -61,189 +62,6 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_C
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
-int main(int argc, char* argv[])
+#include "run_gemm_add_add_fastgelu_example.inc"
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideD0 = 0;
ck::index_t StrideD1 = 4096;
ck::index_t StrideE = 4096;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 12)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideD0 = std::stoi(argv[9]);
StrideD1 = std::stoi(argv[10]);
StrideE = std::stoi(argv[11]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
"StrideE\n");
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_device_buf.ToDevice(d0_m_n.mData.data());
d1_device_buf.ToDevice(d1_m_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 2>{d0_device_buf.GetDeviceBuffer(),
d1_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 2>{StrideD0, StrideD1},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error("wrong! this device_op instance does not support this problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(D0DataType) * N + sizeof(D1DataType) * M * N +
sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< device_op.GetTypeString() << std::endl;
if(do_verification)
{
Tensor<AccDataType> c_m_n(HostTensorDescriptor(
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
}
return 0;
}
int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
using ADataType = F32;
using BDataType = F32;
using AccDataType = F32;
using CShuffleDataType = F32;
using D0DataType = F32;
using D1DataType = F32;
using DsDataType = ck::Tuple<D0DataType, D1DataType>;
using EDataType = F32;
using ALayout = Row;
using BLayout = Col;
using D0Layout = Row;
using D1Layout = Row;
using DsLayout = ck::Tuple<D0Layout, D1Layout>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddAddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>;
// clang-format on
#include "run_gemm_add_add_fastgelu_example.inc"
int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using I8 = int8_t;
using I32 = int32_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using D0DataType = I8;
using D1DataType = I8;
using DsDataType = ck::Tuple<D0DataType, D1DataType>;
using EDataType = I8;
using ALayout = Row;
using BLayout = Col;
using D0Layout = Row;
using D1Layout = Row;
using DsLayout = ck::Tuple<D0Layout, D1Layout>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddAddFastGelu;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>;
// clang-format on
#include "run_gemm_add_add_fastgelu_example.inc"
int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); }
#pragma once
struct ProblemSize final
{
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideD0 = 0;
ck::index_t StrideD1 = 4096;
ck::index_t StrideE = 4096;
};
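// Note on StrideD0 = 0: a zero row stride makes D0 a broadcast tensor, so a
// single N-element row acts as a per-column bias reused for every m. A tiny
// illustrative sketch of the addressing this implies (hypothetical helper,
// not from this file):
//
//   // row-major offset with row stride 0: every m maps to the same row
//   std::size_t d0_offset(std::size_t m, std::size_t n) { return m * 0 + n; }
//
// This is also why only sizeof(D0DataType) * N bytes are counted for D0 in
// the bandwidth estimate below.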
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
};
bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
switch(config.init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
d1_m_n.GenerateTensorValue(GeneratorTensor_2<D1DataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_device_buf.ToDevice(d0_m_n.mData.data());
d1_device_buf.ToDevice(d1_m_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
{StrideD0, StrideD1},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error("wrong! this device_op instance does not support this problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(D0DataType) * N + sizeof(D1DataType) * M * N +
sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< device_op.GetTypeString() << std::endl;
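// Sanity check of the reported metrics for the default ProblemSize (my
// arithmetic, not taken from the source): flop = 2 * M * N * K =
// 2 * 3840 * 4096 * 4096 ~= 1.288e11, so an ave_time of 1 ms would print
// roughly 128.8 TFlops. Likewise gb_per_sec divides bytes by 1e6 and by the
// time in ms, i.e. bytes / 1e9 per second, which is GB/s.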
if(config.do_verification)
{
Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n));
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
}
return true;
}
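// The verification loop above delegates the fused epilogue to CDEElementOp. A
// minimal host-side sketch of what AddAddFastGelu is expected to compute per
// element (the tanh-based GELU approximation is an assumption here; CK's
// exact formula lives in element_wise_operation.hpp):
//
//   #include <cmath>
//   inline float add_add_fast_gelu(float c, float d0, float d1)
//   {
//       const float x = c + d0 + d1; // the two adds
//       const float u = 0.7978845608f * (x + 0.044715f * x * x * x);
//       return 0.5f * x * (1.0f + std::tanh(u)); // fast GELU approximation
//   }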
bool run_gemm_add_add_fastgelu_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else if(argc == 12)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
problem_size.M = std::stoi(argv[4]);
problem_size.N = std::stoi(argv[5]);
problem_size.K = std::stoi(argv[6]);
problem_size.StrideA = std::stoi(argv[7]);
problem_size.StrideB = std::stoi(argv[8]);
problem_size.StrideD0 = std::stoi(argv[9]);
problem_size.StrideD1 = std::stoi(argv[10]);
problem_size.StrideE = std::stoi(argv[11]);
}
else
{
std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, "
"StrideE"
<< std::endl;
return true;
}
return run_gemm_add_add_fastgelu(problem_size, config);
}
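// Example invocations, assuming the executable names defined in the
// CMakeLists.txt hunk above (hypothetical command lines that match the
// argument parsing in this file):
//
//   ./example_gemm_add_add_fastgelu_xdl_fp16                 # defaults
//   ./example_gemm_add_add_fastgelu_xdl_fp16 1 1 1           # verify, integer init, time kernel
//   ./example_gemm_add_add_fastgelu_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 0 4096 4096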
-add_example_executable(example_gemm_bias_e_permute_m3n2_xdl_fp16 gemm_bias_e_permute_m3n2_xdl_fp16.cpp)
+add_example_executable(example_gemm_bias_e_permute_g1m3n2k1_xdl_fp16 gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp)
-add_example_executable(example_gemm_bias_e_permute_m2n3_xdl_fp16 gemm_bias_e_permute_m2n3_xdl_fp16.cpp)
+add_example_executable(example_gemm_bias_e_permute_g1m2n3k1_xdl_fp16 gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp)
@@ -16,6 +16,8 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -33,7 +35,7 @@ using DDataType = F16;
using DsDataType = ck::Tuple<DDataType>;
using EDataType = F16;
-static constexpr ck::index_t NumDimG = 0;
+static constexpr ck::index_t NumDimG = 1;
static constexpr ck::index_t NumDimM = 2;
static constexpr ck::index_t NumDimN = 3;
static constexpr ck::index_t NumDimK = 1;
@@ -69,30 +71,31 @@ template <ck::index_t NumDimM,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
-ck::enable_if_t<NumDimM == 2 && NumDimN == 3 && NumDimK == 1, bool> = false>
-struct ReferenceContraction_M2_N3_K1 : public ck::tensor_operation::device::BaseOperator
+ck::enable_if_t<NumDimG == 1 && NumDimM == 2 && NumDimN == 3 && NumDimK == 1, bool> = false>
+struct ReferenceContraction_G1_M2_N3_K1 : public ck::tensor_operation::device::BaseOperator
{
// Argument
struct Argument : public ck::tensor_operation::device::BaseArgument
{
-Argument(const Tensor<ADataType>& a_ms_ks,
-         const Tensor<BDataType>& b_ns_ks,
-         Tensor<EDataType>& e_ms_ns,
+Argument(const Tensor<ADataType>& a_gs_ms_ks,
+         const Tensor<BDataType>& b_gs_ns_ks,
+         Tensor<EDataType>& e_gs_ms_ns,
         AElementwiseOperation a_element_op,
         BElementwiseOperation b_element_op,
         CDEElementwiseOperation cde_element_op)
-    : a_ms_ks_{a_ms_ks},
-      b_ns_ks_{b_ns_ks},
-      e_ms_ns_{e_ms_ns},
+    : a_gs_ms_ks_{a_gs_ms_ks},
+      b_gs_ns_ks_{b_gs_ns_ks},
+      e_gs_ms_ns_{e_gs_ms_ns},
      a_element_op_{a_element_op},
      b_element_op_{b_element_op},
      cde_element_op_{cde_element_op}
{
}
-const Tensor<ADataType>& a_ms_ks_;
-const Tensor<BDataType>& b_ns_ks_;
-Tensor<EDataType>& e_ms_ns_;
+const Tensor<ADataType>& a_gs_ms_ks_;
+const Tensor<BDataType>& b_gs_ns_ks_;
+Tensor<EDataType>& e_gs_ms_ns_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
@@ -102,12 +105,12 @@ struct ReferenceContraction_M2_N3_K1 : public ck::tensor_operation::device::Base
// Invoker
struct Invoker : public ck::tensor_operation::device::BaseInvoker
{
-using Argument = ReferenceContraction_M2_N3_K1::Argument;
+using Argument = ReferenceContraction_G1_M2_N3_K1::Argument;
float Run(const Argument& arg)
{
-auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1, auto n2) {
-    const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
+auto f_gs_ms_ns = [&](auto g0, auto m0, auto m1, auto n0, auto n1, auto n2) {
+    const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[3];
AccDataType v_acc = 0;
@@ -117,9 +120,10 @@ struct ReferenceContraction_M2_N3_K1 : public ck::tensor_operation::device::Base
AccDataType v_b;
arg.a_element_op_(
-    v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0)));
+    v_a, ck::type_convert<const AccDataType>(arg.a_gs_ms_ks_(g0, m0, m1, k0)));
arg.b_element_op_(
-    v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, n2, k0)));
+    v_b,
+    ck::type_convert<const AccDataType>(arg.b_gs_ns_ks_(g0, n0, n1, n2, k0)));
v_acc += v_a * v_b;
}
@@ -128,15 +132,16 @@ struct ReferenceContraction_M2_N3_K1 : public ck::tensor_operation::device::Base
arg.cde_element_op_(v_c, v_acc);
-arg.e_ms_ns_(m0, m1, n0, n1, n2) = v_c;
+arg.e_gs_ms_ns_(g0, m0, m1, n0, n1, n2) = v_c;
};
-make_ParallelTensorFunctor(f_ms_ns,
-                           arg.e_ms_ns_.mDesc.GetLengths()[0],
-                           arg.e_ms_ns_.mDesc.GetLengths()[1],
-                           arg.e_ms_ns_.mDesc.GetLengths()[2],
-                           arg.e_ms_ns_.mDesc.GetLengths()[3],
-                           arg.e_ms_ns_.mDesc.GetLengths()[4])(
+make_ParallelTensorFunctor(f_gs_ms_ns,
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[0],
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[1],
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[2],
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[3],
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[4],
+                           arg.e_gs_ms_ns_.mDesc.GetLengths()[5])(
    std::thread::hardware_concurrency());
return 0;
@@ -160,14 +165,15 @@ struct ReferenceContraction_M2_N3_K1 : public ck::tensor_operation::device::Base
return true;
}
-static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                         const Tensor<BDataType>& b_ns_ks,
-                         Tensor<EDataType>& e_ms_ns,
+static auto MakeArgument(const Tensor<ADataType>& a_gs_ms_ks,
+                         const Tensor<BDataType>& b_gs_ns_ks,
+                         Tensor<EDataType>& e_gs_ms_ns,
                         AElementwiseOperation a_element_op,
                         BElementwiseOperation b_element_op,
                         CDEElementwiseOperation cde_element_op)
{
-return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
+return Argument{
+    a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
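// In loop-nest form, the reference operator above computes (an illustrative
// sketch of the semantics, using the new G-dimension naming):
//
//   E[g0, m0, m1, n0, n1, n2] =
//       cde_op( sum over k0 of a_op(A[g0, m0, m1, k0]) * b_op(B[g0, n0, n1, n2, k0]) )
//
// i.e. a batched tensor contraction over the single K dimension, parallelized
// across all six output indices by make_ParallelTensorFunctor.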
@@ -196,28 +202,31 @@ int main(int argc, char* argv[])
int init_method = 1;
bool time_kernel = false;
+ck::index_t G0 = 1;
ck::index_t M0 = 4;
ck::index_t M1 = 256;
ck::index_t N0 = 4;
-ck::index_t N1 = 8;
-ck::index_t N2 = 128;
+ck::index_t N1 = 16;
+ck::index_t N2 = 32;
ck::index_t K0 = 256;
// A[M0, M1, M2, K0]
-std::vector<ck::index_t> a_ms_ks_lengths{M0, M1, K0};
-std::vector<ck::index_t> a_ms_ks_strides{M1 * K0, K0, 1};
+std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, M0, M1, K0};
+std::vector<ck::index_t> a_gs_ms_ks_strides{M0 * M1 * K0, M1 * K0, K0, 1};
// B[N0, N1, K0]
-std::vector<ck::index_t> b_ns_ks_lengths{N0, N1, N2, K0};
-std::vector<ck::index_t> b_ns_ks_strides{N1 * N2 * K0, N2 * K0, K0, 1};
+std::vector<ck::index_t> b_gs_ns_ks_lengths{G0, N0, N1, N2, K0};
+std::vector<ck::index_t> b_gs_ns_ks_strides{N0 * N1 * N2 * K0, N1 * N2 * K0, N2 * K0, K0, 1};
// D[N0, M0, N1, M1, N2]
-std::vector<ck::index_t> d_ms_ns_lengths{M0, M1, N0, N1, N2};
-std::vector<ck::index_t> d_ms_ns_strides{0, 0, N1 * N2, N1, 1};
+std::vector<ck::index_t> d_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2};
+std::vector<ck::index_t> d_gs_ms_ns_strides{N0 * N1 * N2, 0, 0, N1 * N2, N2, 1};
// E[N0, M0, N1, M1, N2]
-std::vector<ck::index_t> e_ms_ns_lengths{M0, M1, N0, N1, N2};
-std::vector<ck::index_t> e_ms_ns_strides{N1 * M1 * N2, N2, M0 * N1 * M1 * N2, M1 * N2, 1};
+std::vector<ck::index_t> e_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2};
+std::vector<ck::index_t> e_gs_ms_ns_strides{
+    M0 * M1 * N0 * N1 * N2, N1 * M1 * N2, N2, M0 * N1 * M1 * N2, M1 * N2, 1};
if(argc == 1)
{
@@ -237,50 +246,51 @@ int main(int argc, char* argv[])
exit(0);
}
-Tensor<ADataType> a_ms_ks(
-    std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
-    std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
-Tensor<BDataType> b_ns_ks(
-    std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
-    std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
-Tensor<DDataType> d_ms_ns(
-    std::vector<std::size_t>(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()),
-    std::vector<std::size_t>(d_ms_ns_strides.begin(), d_ms_ns_strides.end()));
-Tensor<EDataType> e_ms_ns_host_result(
-    std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-    std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
-Tensor<EDataType> e_ms_ns_device_result(
-    std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-    std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+Tensor<ADataType> a_gs_ms_ks(
+    std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
+    std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+Tensor<BDataType> b_gs_ns_ks(
+    std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
+    std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
+Tensor<DDataType> d_gs_ms_ns(
+    std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
+    std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
+Tensor<EDataType> e_gs_ms_ns_host_result(
+    std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+    std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+Tensor<EDataType> e_gs_ms_ns_device_result(
+    std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+    std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
-std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
-std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
-std::cout << "d_ms_ns: " << d_ms_ns.mDesc << std::endl;
-std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
+std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
+std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
+std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
+std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
-a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-d_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
default:
-a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-d_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
}
-DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
-DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
-DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpaceSize());
-DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
+DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize());
+DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize());
+DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize());
+DeviceMem e_device_buf(sizeof(EDataType) *
+                       e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize());
-a_device_buf.ToDevice(a_ms_ks.mData.data());
-b_device_buf.ToDevice(b_ns_ks.mData.data());
-d_device_buf.ToDevice(d_ms_ns.mData.data());
+a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
+b_device_buf.ToDevice(b_gs_ns_ks.mData.data());
+d_device_buf.ToDevice(d_gs_ms_ns.mData.data());
// set zero
e_device_buf.SetZero();
@@ -296,14 +306,14 @@ int main(int argc, char* argv[])
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
-a_ms_ks_lengths,
-a_ms_ks_strides,
-b_ns_ks_lengths,
-b_ns_ks_strides,
-std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-e_ms_ns_lengths,
-e_ms_ns_strides,
+a_gs_ms_ks_lengths,
+a_gs_ms_ks_strides,
+b_gs_ns_ks_lengths,
+b_gs_ns_ks_strides,
+std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths},
+std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides},
+e_gs_ms_ns_lengths,
+e_gs_ms_ns_strides,
a_element_op,
b_element_op,
cde_element_op);
@@ -317,18 +327,18 @@ int main(int argc, char* argv[])
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
-ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
-                                e_ms_ns_lengths.begin() + NumDimM,
-                                ck::index_t{1},
-                                std::multiplies<ck::index_t>{});
+std::size_t M = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG,
+                                e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+                                ck::index_t{1},
+                                std::multiplies<ck::index_t>{});
-ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
-                                e_ms_ns_lengths.begin() + NumDimM + NumDimN,
-                                ck::index_t{1},
-                                std::multiplies<ck::index_t>{});
+std::size_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM,
+                                e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM + NumDimN,
+                                ck::index_t{1},
+                                std::multiplies<ck::index_t>{});
-ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
-                                a_ms_ks_lengths.begin() + NumDimM + NumDimK,
-                                ck::index_t{1},
-                                std::multiplies<ck::index_t>{});
+std::size_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM,
+                                a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM + NumDimK,
+                                ck::index_t{1},
+                                std::multiplies<ck::index_t>{});
@@ -343,15 +353,15 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
          << op.GetTypeString() << std::endl;
-e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
+e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data());
if(do_verification)
{
-Tensor<CShuffleDataType> c_ms_ns_host_result(
-    std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-    std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
+    std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+    std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
-using ReferenceOpInstance = ReferenceContraction_M2_N3_K1<NumDimM,
+using ReferenceOpInstance = ReferenceContraction_G1_M2_N3_K1<NumDimM,
                                                          NumDimN,
                                                          NumDimK,
                                                          ADataType,
@@ -365,31 +375,41 @@ int main(int argc, char* argv[])
auto ref_gemm    = ReferenceOpInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
-auto ref_argument = ref_gemm.MakeArgument(
-    a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+auto ref_argument = ref_gemm.MakeArgument(a_gs_ms_ks,
+                                          b_gs_ns_ks,
+                                          c_gs_ms_ns_host_result,
+                                          a_element_op,
+                                          b_element_op,
+                                          PassThrough{});
ref_invoker.Run(ref_argument);
-for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
+for(size_t g0 = 0; g0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[0]; ++g0)
+{
+for(size_t m0 = 0; m0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[1]; ++m0)
{
-for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
+for(size_t m1 = 0; m1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[2]; ++m1)
{
-for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0)
+for(size_t n0 = 0; n0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[3]; ++n0)
{
-for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1)
+for(size_t n1 = 0; n1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[4]; ++n1)
{
-for(size_t n2 = 0; n2 < e_ms_ns_host_result.mDesc.GetLengths()[4]; ++n2)
+for(size_t n2 = 0; n2 < e_gs_ms_ns_host_result.mDesc.GetLengths()[5]; ++n2)
{
-cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1, n2),
-               c_ms_ns_host_result(m0, m1, n0, n1, n2),
-               d_ms_ns(m0, m1, n0, n1, n2));
+cde_element_op(e_gs_ms_ns_host_result(g0, m0, m1, n0, n1, n2),
+               c_gs_ms_ns_host_result(g0, m0, m1, n0, n1, n2),
+               d_gs_ms_ns(g0, m0, m1, n0, n1, n2));
}
}
}
}
+}
}
-return ck::utils::check_err(e_ms_ns_device_result.mData, e_ms_ns_host_result.mData) ? 0 : 1;
+return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
+    ? 0
+    : 1;
}
return 0;
......
@@ -33,7 +33,7 @@ using DDataType = F16;
 using DsDataType = ck::Tuple<DDataType>;
 using EDataType  = F16;

-static constexpr ck::index_t NumDimG = 0;
+static constexpr ck::index_t NumDimG = 1;
 static constexpr ck::index_t NumDimM = 3;
 static constexpr ck::index_t NumDimN = 2;
 static constexpr ck::index_t NumDimK = 1;

@@ -53,13 +53,13 @@ using DeviceOpInstanceKKNN = ck::tensor_operation::device::
 //############################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
 //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
 //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>;
+DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>;
 // clang-format on

 using DeviceOpInstance = DeviceOpInstanceKKNN;

-// hardcoded for NumDimM == NumDimN == NumDimK == 2
-template <ck::index_t NumDimM,
+template <ck::index_t NumDimG,
+          ck::index_t NumDimM,
           ck::index_t NumDimN,
           ck::index_t NumDimK,
           typename ADataType,

@@ -69,30 +69,31 @@ template <ck::index_t NumDimM,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CDEElementwiseOperation,
-          ck::enable_if_t<NumDimM == 3 && NumDimN == 2 && NumDimK == 1, bool> = false>
-struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::BaseOperator
+          ck::enable_if_t<NumDimG == 1 && NumDimM == 3 && NumDimN == 2 && NumDimK == 1, bool> =
+              false>
+struct ReferenceContraction_G1_M3_N2_K1 : public ck::tensor_operation::device::BaseOperator
 {
     // Argument
     struct Argument : public ck::tensor_operation::device::BaseArgument
     {
-        Argument(const Tensor<ADataType>& a_ms_ks,
-                 const Tensor<BDataType>& b_ns_ks,
-                 Tensor<EDataType>& e_ms_ns,
+        Argument(const Tensor<ADataType>& a_gs_ms_ks,
+                 const Tensor<BDataType>& b_gs_ns_ks,
+                 Tensor<EDataType>& e_gs_ms_ns,
                  AElementwiseOperation a_element_op,
                  BElementwiseOperation b_element_op,
                  CDEElementwiseOperation cde_element_op)
-            : a_ms_ks_{a_ms_ks},
-              b_ns_ks_{b_ns_ks},
-              e_ms_ns_{e_ms_ns},
+            : a_gs_ms_ks_{a_gs_ms_ks},
+              b_gs_ns_ks_{b_gs_ns_ks},
+              e_gs_ms_ns_{e_gs_ms_ns},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
              cde_element_op_{cde_element_op}
        {
        }

-        const Tensor<ADataType>& a_ms_ks_;
-        const Tensor<BDataType>& b_ns_ks_;
-        Tensor<EDataType>& e_ms_ns_;
+        const Tensor<ADataType>& a_gs_ms_ks_;
+        const Tensor<BDataType>& b_gs_ns_ks_;
+        Tensor<EDataType>& e_gs_ms_ns_;

         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;

@@ -102,12 +103,12 @@ struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::Base
     // Invoker
     struct Invoker : public ck::tensor_operation::device::BaseInvoker
     {
-        using Argument = ReferenceContraction_M3_N2_K1::Argument;
+        using Argument = ReferenceContraction_G1_M3_N2_K1::Argument;

         float Run(const Argument& arg)
         {
-            auto f_ms_ns = [&](auto m0, auto m1, auto m2, auto n0, auto n1) {
-                const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[3];
+            auto f_gs_ms_ns = [&](auto g0, auto m0, auto m1, auto m2, auto n0, auto n1) {
+                const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[4];

                 AccDataType v_acc = 0;

@@ -117,9 +118,10 @@ struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::Base
                     AccDataType v_b;

                     arg.a_element_op_(
-                        v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, m2, k0)));
+                        v_a,
+                        ck::type_convert<const AccDataType>(arg.a_gs_ms_ks_(g0, m0, m1, m2, k0)));
                     arg.b_element_op_(
-                        v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0)));
+                        v_b, ck::type_convert<const AccDataType>(arg.b_gs_ns_ks_(g0, n0, n1, k0)));

                     v_acc += v_a * v_b;
                 }

@@ -128,15 +130,16 @@ struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::Base
                 arg.cde_element_op_(v_c, v_acc);

-                arg.e_ms_ns_(m0, m1, m2, n0, n1) = v_c;
+                arg.e_gs_ms_ns_(g0, m0, m1, m2, n0, n1) = v_c;
             };

-            make_ParallelTensorFunctor(f_ms_ns,
-                                       arg.e_ms_ns_.mDesc.GetLengths()[0],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[1],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[2],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[3],
-                                       arg.e_ms_ns_.mDesc.GetLengths()[4])(
+            make_ParallelTensorFunctor(f_gs_ms_ns,
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[0],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[1],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[2],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[3],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[4],
+                                       arg.e_gs_ms_ns_.mDesc.GetLengths()[5])(
                 std::thread::hardware_concurrency());

             return 0;

@@ -160,14 +163,15 @@ struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::Base
         return true;
     }

-    static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
-                             const Tensor<BDataType>& b_ns_ks,
-                             Tensor<EDataType>& e_ms_ns,
+    static auto MakeArgument(const Tensor<ADataType>& a_gs_ms_ks,
+                             const Tensor<BDataType>& b_gs_ns_ks,
+                             Tensor<EDataType>& e_gs_ms_ns,
                              AElementwiseOperation a_element_op,
                              BElementwiseOperation b_element_op,
                              CDEElementwiseOperation cde_element_op)
     {
-        return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
+        return Argument{
+            a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op};
     }

     static auto MakeInvoker() { return Invoker{}; }

@@ -182,7 +186,7 @@ struct ReferenceContraction_M3_N2_K1 : public ck::tensor_operation::device::Base
         auto str = std::stringstream();
         // clang-format off
-        str << "ReferenceContraction_M3_N2_K1"
+        str << "ReferenceContraction_G1_M3_N2_K1"
            << std::endl;
         // clang-format on

@@ -196,36 +200,33 @@ int main(int argc, char* argv[])
     int init_method      = 1;
     bool time_kernel     = false;

+    ck::index_t G0 = 1;
     ck::index_t M0 = 4;
-    ck::index_t M1 = 32;
-    ck::index_t M2 = 128;
+    ck::index_t M1 = 8;
+    ck::index_t M2 = 256;

-    ck::index_t N0 = 16;
-    ck::index_t N1 = 256;
+    ck::index_t N0 = 32;
+    ck::index_t N1 = 128;

-    ck::index_t K0 = 256;
+    ck::index_t K0 = 1024;

     // A[M0, M1, M2, K0]
-    std::vector<ck::index_t> a_ms_ks_lengths{M0, M1, M2, K0};
-    std::vector<ck::index_t> a_ms_ks_strides{M1 * M2 * K0, M2 * K0, K0, 1};
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, M0, M1, M2, K0};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{M0 * M1 * M2 * K0, M1 * M2 * K0, M2 * K0, K0, 1};
     // B[N0, N1, K0]
-    std::vector<ck::index_t> b_ns_ks_lengths{N0, N1, K0};
-    std::vector<ck::index_t> b_ns_ks_strides{N1 * K0, K0, 1};
+    std::vector<ck::index_t> b_gs_ns_ks_lengths{G0, N0, N1, K0};
+    std::vector<ck::index_t> b_gs_ns_ks_strides{N0 * N1 * K0, N1 * K0, K0, 1};

-#if 1
-    // D[M0, N0, M1, N1, M2]
-    std::vector<ck::index_t> d_ms_ns_lengths{M0, M1, M2, N0, N1};
-    std::vector<ck::index_t> d_ms_ns_strides{0, 0, 0, N1, 1};
-    // E[M0, N0, M1, N1, M2]
-    std::vector<ck::index_t> e_ms_ns_lengths{M0, M1, M2, N0, N1};
-    std::vector<ck::index_t> e_ms_ns_strides{N0 * M1 * N1 * M2, N1 * M2, 1, M1 * N1 * M2, M2};
-#else
-    // D[M0, N0, M1, N1, M2]
-    std::vector<ck::index_t> d_ms_ns_lengths{M0, M1, M2, N0, N1};
-    std::vector<ck::index_t> d_ms_ns_strides{0, 0, 0, N1, 1};
-    // E[M0, N0, M1, N1, M2]
-    std::vector<ck::index_t> e_ms_ns_lengths{M0, M1, M2, N0, N1};
-    std::vector<ck::index_t> e_ms_ns_strides{M1 * M2 * N0 * N1, M2 * N0 * N1, N0 * N1, N1, 1};
-#endif
+    // D[M0, N0, M1, N1, M2]
+    std::vector<ck::index_t> d_gs_ms_ns_lengths{G0, M0, M1, M2, N0, N1};
+    std::vector<ck::index_t> d_gs_ms_ns_strides{N0 * N1, 0, 0, 0, N1, 1};
+
+    // E[M1, M0, N0, M1, N1]
+    std::vector<ck::index_t> e_gs_ms_ns_lengths{G0, M0, M1, M2, N0, N1};
+    std::vector<ck::index_t> e_gs_ms_ns_strides{
+        M0 * M1 * M2 * N1 * N0, N0 * M1 * N1, N1, M0 * N0 * M1 * N1, M1 * N1, 1};

     if(argc == 1)
     {

@@ -245,50 +246,51 @@ int main(int argc, char* argv[])
         exit(0);
     }

-    Tensor<ADataType> a_ms_ks(
-        std::vector<std::size_t>(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()),
-        std::vector<std::size_t>(a_ms_ks_strides.begin(), a_ms_ks_strides.end()));
-    Tensor<BDataType> b_ns_ks(
-        std::vector<std::size_t>(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()),
-        std::vector<std::size_t>(b_ns_ks_strides.begin(), b_ns_ks_strides.end()));
-    Tensor<DDataType> d_ms_ns(
-        std::vector<std::size_t>(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()),
-        std::vector<std::size_t>(d_ms_ns_strides.begin(), d_ms_ns_strides.end()));
-    Tensor<EDataType> e_ms_ns_host_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
-    Tensor<EDataType> e_ms_ns_device_result(
-        std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-        std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+    Tensor<ADataType> a_gs_ms_ks(
+        std::vector<std::size_t>(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()),
+        std::vector<std::size_t>(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end()));
+    Tensor<BDataType> b_gs_ns_ks(
+        std::vector<std::size_t>(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()),
+        std::vector<std::size_t>(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end()));
+    Tensor<DDataType> d_gs_ms_ns(
+        std::vector<std::size_t>(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end()));
+    Tensor<EDataType> e_gs_ms_ns_host_result(
+        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));
+    Tensor<EDataType> e_gs_ms_ns_device_result(
+        std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+        std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));

-    std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl;
-    std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl;
-    std::cout << "d_ms_ns: " << d_ms_ns.mDesc << std::endl;
-    std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl;
+    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
+    std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl;
+    std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
+    std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl;

     switch(init_method)
     {
     case 0: break;
     case 1:
-        a_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-        b_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-        d_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
         break;
     default:
-        a_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-        d_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
        break;
     }

-    DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) *
+                           e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize());

-    a_device_buf.ToDevice(a_ms_ks.mData.data());
-    b_device_buf.ToDevice(b_ns_ks.mData.data());
-    d_device_buf.ToDevice(d_ms_ns.mData.data());
+    a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
+    b_device_buf.ToDevice(b_gs_ns_ks.mData.data());
+    d_device_buf.ToDevice(d_gs_ms_ns.mData.data());

     // set zero
     e_device_buf.SetZero();

@@ -304,14 +306,14 @@ int main(int argc, char* argv[])
         b_device_buf.GetDeviceBuffer(),
         std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
         e_device_buf.GetDeviceBuffer(),
-        a_ms_ks_lengths,
-        a_ms_ks_strides,
-        b_ns_ks_lengths,
-        b_ns_ks_strides,
-        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
-        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
-        e_ms_ns_lengths,
-        e_ms_ns_strides,
+        a_gs_ms_ks_lengths,
+        a_gs_ms_ks_strides,
+        b_gs_ns_ks_lengths,
+        b_gs_ns_ks_strides,
+        std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_lengths},
+        std::array<std::vector<ck::index_t>, 1>{d_gs_ms_ns_strides},
+        e_gs_ms_ns_lengths,
+        e_gs_ms_ns_strides,
         a_element_op,
         b_element_op,
         cde_element_op);

@@ -325,18 +327,18 @@ int main(int argc, char* argv[])
     float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

-    ck::index_t M = std::accumulate(e_ms_ns_lengths.begin(),
-                                    e_ms_ns_lengths.begin() + NumDimM,
+    ck::index_t M = std::accumulate(e_gs_ms_ns_lengths.begin(),
+                                    e_gs_ms_ns_lengths.begin() + NumDimM,
                                     ck::index_t{1},
                                     std::multiplies<ck::index_t>{});

-    ck::index_t N = std::accumulate(e_ms_ns_lengths.begin() + NumDimM,
-                                    e_ms_ns_lengths.begin() + NumDimM + NumDimN,
+    ck::index_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimM,
+                                    e_gs_ms_ns_lengths.begin() + NumDimM + NumDimN,
                                     ck::index_t{1},
                                     std::multiplies<ck::index_t>{});

-    ck::index_t K = std::accumulate(a_ms_ks_lengths.begin() + NumDimM,
-                                    a_ms_ks_lengths.begin() + NumDimM + NumDimK,
+    ck::index_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimM,
+                                    a_gs_ms_ks_lengths.begin() + NumDimM + NumDimK,
                                     ck::index_t{1},
                                     std::multiplies<ck::index_t>{});

@@ -351,15 +353,16 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << op.GetTypeString() << std::endl;

-    e_device_buf.FromDevice(e_ms_ns_device_result.mData.data());
+    e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data());

     if(do_verification)
     {
-        Tensor<CShuffleDataType> c_ms_ns_host_result(
-            std::vector<std::size_t>(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()),
-            std::vector<std::size_t>(e_ms_ns_strides.begin(), e_ms_ns_strides.end()));
+        Tensor<CShuffleDataType> c_gs_ms_ns_host_result(
+            std::vector<std::size_t>(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()),
+            std::vector<std::size_t>(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end()));

-        using ReferenceOpInstance = ReferenceContraction_M3_N2_K1<NumDimM,
+        using ReferenceOpInstance = ReferenceContraction_G1_M3_N2_K1<NumDimG,
+                                                                     NumDimM,
                                                                      NumDimN,
                                                                      NumDimK,
                                                                      ADataType,

@@ -373,31 +376,41 @@ int main(int argc, char* argv[])
         auto ref_gemm    = ReferenceOpInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();

-        auto ref_argument = ref_gemm.MakeArgument(
-            a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+        auto ref_argument = ref_gemm.MakeArgument(a_gs_ms_ks,
+                                                  b_gs_ns_ks,
+                                                  c_gs_ms_ns_host_result,
+                                                  a_element_op,
+                                                  b_element_op,
+                                                  PassThrough{});

         ref_invoker.Run(ref_argument);

-        for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0)
-        {
-            for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1)
-            {
-                for(size_t m2 = 0; m2 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++m2)
-                {
-                    for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n0)
-                    {
-                        for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[4]; ++n1)
-                        {
-                            cde_element_op(e_ms_ns_host_result(m0, m1, m2, n0, n1),
-                                           c_ms_ns_host_result(m0, m1, m2, n0, n1),
-                                           d_ms_ns(m0, m1, m2, n0, n1));
-                        }
-                    }
-                }
-            }
-        }
+        for(size_t g0 = 0; g0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[0]; ++g0)
+        {
+            for(size_t m0 = 0; m0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[1]; ++m0)
+            {
+                for(size_t m1 = 0; m1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[2]; ++m1)
+                {
+                    for(size_t m2 = 0; m2 < e_gs_ms_ns_host_result.mDesc.GetLengths()[3]; ++m2)
+                    {
+                        for(size_t n0 = 0; n0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[4]; ++n0)
+                        {
+                            for(size_t n1 = 0; n1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[5]; ++n1)
+                            {
+                                cde_element_op(e_gs_ms_ns_host_result(g0, m0, m1, m2, n0, n1),
+                                               c_gs_ms_ns_host_result(g0, m0, m1, m2, n0, n1),
+                                               d_gs_ms_ns(g0, m0, m1, m2, n0, n1));
+                            }
+                        }
+                    }
+                }
+            }
+        }

-        return ck::utils::check_err(e_ms_ns_device_result.mData, e_ms_ns_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData)
+                   ? 0
+                   : 1;
     }

     return 0;

...
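For reporting purposes the examples flatten the multi-index contraction dimensions into plain GEMM sizes. A standalone sketch of that flattening for the lengths configured above (editor's addition; the variable names are illustrative, and the leading G dimension is skipped via the NumDimG offset as in the batched variant):

```
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // E lengths as configured above: [G0, M0, M1, M2, N0, N1] = [1, 4, 8, 256, 32, 128]
    const int NumDimG = 1, NumDimM = 3, NumDimN = 2;
    const std::vector<std::size_t> e_lengths{1, 4, 8, 256, 32, 128};

    // Product of a [first, first + count) slice of the length vector,
    // mirroring the std::accumulate calls used for M, N and K.
    auto prod = [&](int first, int count) {
        return std::accumulate(e_lengths.begin() + first,
                               e_lengths.begin() + first + count,
                               std::size_t{1},
                               std::multiplies<std::size_t>{});
    };

    std::cout << "M = " << prod(NumDimG, NumDimM)             // 4 * 8 * 256 = 8192
              << ", N = " << prod(NumDimG + NumDimM, NumDimN) // 32 * 128   = 4096
              << std::endl;
    return 0;
}
```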
@@ -46,7 +46,7 @@ using DeviceInstance = ck::tensor_operation::device::DeviceLayernorm<XDataType,
                        8, // SrcScalarPerVector
                        8, // GammaScalarPerVector
                        8, // BetaScalarPerVector
-                       1>; // OutScalarPerVector
+                       8>; // OutScalarPerVector

 int main()
 {

...
add_example_executable(example_grouped_convnd_fwd_bias_relu_xdl_fp16 grouped_convnd_fwd_bias_relu_xdl_fp16.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_xdl_fp16 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 PRIVATE utility)
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp)
target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 PRIVATE utility)
\ No newline at end of file
@@ -12,17 +12,23 @@
 # <left padding>, (ie LeftPy, LeftPx for 2D)
 # <right padding>, (ie RightPy, RightPx for 2D)
-bin/example_grouped_convnd_fwd_bias_relu_xdl_fp16 1 1 1
+bin/example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 1 1 1
 ```

 Result (MI100)
 ```
-in: dim 5, lengths {1, 128, 192, 71, 71}, strides {6912, 967872, 1, 13632, 192}
-wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {192, 1728, 1, 576, 192}
-bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
-out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
-launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
+in: dim 5, lengths {2, 128, 192, 71, 71}, strides {192, 1935744, 1, 27264, 384}
+wei: dim 5, lengths {2, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
+bias: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+residual: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
+out: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 663552, 1, 18432, 512}
+A[M, K]: {165888, 1728}
+B[N, K]: {256, 1728}
+Ds[M, N]: {165888, 256}
+Ds[M, N]: {165888, 256}
+E[M, N]: {165888, 256}
+launch_and_time_kernel: grid_dim {2592, 1, 1}, block_dim {256, 1, 1}
 Warm up 1 time
 Start running 10 times...
-Perf: 1.19215 ms, 123.112 TFlops, 279.827 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
+Perf: 2.48075 ms, 118.325 TFlops, 268.946 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
 ```
\ No newline at end of file
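The TFlops figure in the log is just the forward-convolution flop count divided by the kernel time. A minimal sketch of that arithmetic for the run above (editor's addition; it assumes the usual 2-flops-per-MAC convention, with all shape constants taken from the log):

```
#include <cstdint>
#include <iostream>

int main()
{
    // Shapes from the MI100 log: G=2, N=128, K=256, C=192, 3x3 filter, 36x36 output
    const std::int64_t G = 2, N = 128, K = 256, C = 192;
    const std::int64_t Y = 3, X = 3, Ho = 36, Wo = 36;

    const double flop    = 2.0 * G * N * K * Ho * Wo * C * Y * X; // 2 flops per MAC
    const double time_ms = 2.48075;                               // reported kernel time

    std::cout << flop / (time_ms * 1e-3) / 1e12 << " TFlops" << std::endl; // ~118.3
    return 0;
}
```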
@@ -28,18 +28,20 @@ void print_helper_msg()
 template <ck::index_t NDimSpatial,
           typename InDataType,
           typename WeiDataType,
+          typename CShuffleDataType,
           typename OutDataType,
           typename InElementOp,
           typename WeiElementOp,
           typename OutElementOp,
           typename DeviceConvNDFwdInstance>
-int run_grouped_conv_fwd_bias(bool do_verification,
+int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
           int init_method,
           bool time_kernel,
           const ck::utils::conv::ConvParam& conv_param,
           const HostTensorDescriptor& in_g_n_c_wis_desc,
           const HostTensorDescriptor& wei_g_k_c_xs_desc,
           const HostTensorDescriptor& bias_g_n_k_wos_desc,
+          const HostTensorDescriptor& residual_g_n_k_wos_desc,
           const HostTensorDescriptor& out_g_n_k_wos_desc,
           const InElementOp& in_element_op,
           const WeiElementOp& wei_element_op,

@@ -48,12 +50,14 @@ int run_grouped_conv_fwd_bias(bool do_verification,
     Tensor<InDataType> in(in_g_n_c_wis_desc);
     Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
     Tensor<OutDataType> bias(bias_g_n_k_wos_desc);
+    Tensor<OutDataType> residual(residual_g_n_k_wos_desc);
     Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
     Tensor<OutDataType> out_device(out_g_n_k_wos_desc);

     std::cout << "in: " << in.mDesc << std::endl;
     std::cout << "wei: " << wei.mDesc << std::endl;
     std::cout << "bias: " << bias.mDesc << std::endl;
+    std::cout << "residual: " << residual.mDesc << std::endl;
     std::cout << "out: " << out_host.mDesc << std::endl;

     switch(init_method)

@@ -73,18 +77,22 @@ int run_grouped_conv_fwd_bias(bool do_verification,
     DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
     DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
     DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize());
+    DeviceMem residual_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpaceSize());
     DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());

     in_device_buf.ToDevice(in.mData.data());
     wei_device_buf.ToDevice(wei.mData.data());
     bias_device_buf.ToDevice(bias.mData.data());
+    residual_device_buf.ToDevice(residual.mData.data());

     std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
     std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
     std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
     std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
-    std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> d1_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> d1_g_n_k_wos_strides{};
     std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
     std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
     std::array<ck::index_t, NDimSpatial> conv_filter_strides{};

@@ -98,8 +106,10 @@ int run_grouped_conv_fwd_bias(bool do_verification,
     copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
     copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
     copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
-    copy(bias_g_n_k_wos_desc.GetLengths(), d_g_n_k_wos_lengths);
-    copy(bias_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides);
+    copy(bias_g_n_k_wos_desc.GetLengths(), d0_g_n_k_wos_lengths);
+    copy(bias_g_n_k_wos_desc.GetStrides(), d0_g_n_k_wos_strides);
+    copy(residual_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths);
+    copy(residual_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides);
     copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
     copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
     copy(conv_param.conv_filter_strides_, conv_filter_strides);

@@ -110,17 +120,20 @@ int run_grouped_conv_fwd_bias(bool do_verification,
     // do Conv
     auto conv    = DeviceConvNDFwdInstance{};
     auto invoker = conv.MakeInvoker();
-    auto argument = conv.MakeArgument(
-        in_device_buf.GetDeviceBuffer(),
-        wei_device_buf.GetDeviceBuffer(),
-        std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
-        out_device_buf.GetDeviceBuffer(),
-        a_g_n_c_wis_lengths,
-        a_g_n_c_wis_strides,
-        b_g_k_c_xs_lengths,
-        b_g_k_c_xs_strides,
-        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_lengths}},
-        std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_strides}},
-        e_g_n_k_wos_lengths,
-        e_g_n_k_wos_strides,
-        conv_filter_strides,
+    auto argument =
+        conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                          wei_device_buf.GetDeviceBuffer(),
+                          std::array<const void*, 2>{bias_device_buf.GetDeviceBuffer(),
+                                                     residual_device_buf.GetDeviceBuffer()},
+                          out_device_buf.GetDeviceBuffer(),
+                          a_g_n_c_wis_lengths,
+                          a_g_n_c_wis_strides,
+                          b_g_k_c_xs_lengths,
+                          b_g_k_c_xs_strides,
+                          std::array<std::array<ck::index_t, NDimSpatial + 3>, 2>{
+                              {d0_g_n_k_wos_lengths, d1_g_n_k_wos_lengths}},
+                          std::array<std::array<ck::index_t, NDimSpatial + 3>, 2>{
+                              {d0_g_n_k_wos_strides, d1_g_n_k_wos_strides}},
+                          e_g_n_k_wos_lengths,
+                          e_g_n_k_wos_strides,
+                          conv_filter_strides,

@@ -152,12 +165,12 @@ int run_grouped_conv_fwd_bias(bool do_verification,
     {
         using PassThrough = ck::tensor_operation::element_wise::PassThrough;

-        Tensor<OutDataType> c_host(out_g_n_k_wos_desc);
+        Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);

         auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                      InDataType,
                                                                      WeiDataType,
-                                                                     OutDataType,
+                                                                     CShuffleDataType,
                                                                      InElementOp,
                                                                      WeiElementOp,
                                                                      PassThrough>();

@@ -177,8 +190,9 @@ int run_grouped_conv_fwd_bias(bool do_verification,
         ref_invoker.Run(ref_argument);

         // TODO: implement elementwise operation for host
-        out_host.ForEach(
-            [&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx), bias(idx)); });
+        out_host.ForEach([&](auto&, auto idx) {
+            out_element_op(out_host(idx), c_host(idx), bias(idx), residual(idx));
+        });

         out_device_buf.FromDevice(out_device.mData.data());

...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
using AccDataType = float;
using CShuffleDataType = float;
using BiasDataType = ck::bhalf_t;
using ResidualDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
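// Editor's note: AddReluAdd presumably fuses both extra D tensors into the
// output as out = relu(conv + bias) + residual; the operand order matches the
// host-side check in the common header, which calls
// out_element_op(out_host(idx), c_host(idx), bias(idx), residual(idx)).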
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType, ResidualDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, //
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1,
1,
S<1, 32, 1, 8>,
8>;
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
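// Editor's worked check of the mapping above (not part of the original
// example): the conventional layout splits its channels across the G groups,
// so per-group counts become C = 384 / 2 = 192 and K = 512 / 2 = 256, which
// is exactly the ConvParam{ndim=2, G=2, N=128, K=256, C=192, ...} constructed
// here.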
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_NW_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
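// Editor's note: the bias and residual descriptors below use stride 0 on the
// n and spatial dimensions, so a single [G, K] slice is broadcast across
// every output position instead of materializing a full [G, N, K, Wo] tensor.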
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // k
1, // c
0 // x
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // k
1, // c
0 // x
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_NHW_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_NDHW_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // z
0, // y
0 // x
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // z
0, // y
0 // x
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

-#include "grouped_convnd_fwd_bias_common.hpp"
+#include "grouped_convnd_fwd_bias_relu_add_common.hpp"

 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"

@@ -12,6 +12,7 @@ using WeiDataType = ck::half_t;
 using AccDataType      = float;
 using CShuffleDataType = ck::half_t;
 using BiasDataType     = ck::half_t;
+using ResidualDataType = ck::half_t;
 using OutDataType      = ck::half_t;

 template <ck::index_t... Is>

@@ -19,31 +20,31 @@ using S = ck::Sequence<Is...>;
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
+using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;

 static constexpr auto ConvSpec =
     ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

-#if 1
 template <ck::index_t NDimSpatial,
           typename InLayout,
           typename WeiLayout,
           typename BiasLayout,
+          typename ResidualLayout,
           typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
     ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
         NDimSpatial,
         InLayout,
         WeiLayout,
-        ck::Tuple<BiasLayout>,
+        ck::Tuple<BiasLayout, ResidualLayout>,
         OutLayout,
         InDataType,
         WeiDataType,
         AccDataType,
         CShuffleDataType,
-        ck::Tuple<BiasDataType>,
+        ck::Tuple<BiasDataType, ResidualDataType>,
         OutDataType,
         InElementOp,
         WeiElementOp,

@@ -79,60 +80,6 @@ using DeviceGroupedConvNDFwdInstance =
         1,
         S<1, 32, 1, 8>,
         8>;
-#else
-template <ck::index_t NDimSpatial,
-          typename InLayout,
-          typename WeiLayout,
-          typename BiasLayout,
-          typename OutLayout>
-using DeviceGroupedConvNDFwdInstance =
-    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
-        NDimSpatial,
-        InLayout,
-        WeiLayout,
-        ck::Tuple<BiasLayout>,
-        OutLayout,
-        InDataType,
-        WeiDataType,
-        AccDataType,
-        CShuffleDataType,
-        ck::Tuple<BiasDataType>,
-        OutDataType,
-        InElementOp,
-        WeiElementOp,
-        OutElementOp,
-        ConvSpec,    // ConvForwardSpecialization
-        GemmSpec,    // GemmSpecialization
-        1,           //
-        256,         // BlockSize
-        256,         // MPerBlock
-        16,          // NPerBlock
-        32,          // KPerBlock
-        8,           // AK1
-        8,           // BK1
-        16,          // MPerXdl
-        16,          // NPerXdl
-        4,           // MXdlPerWave
-        1,           // NXdlPerWave
-        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
-        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
-        2,           // ABlockTransferSrcVectorDim
-        8,           // ABlockTransferSrcScalarPerVector
-        8,           // ABlockTransferDstScalarPerVector_AK1
-        1,           // ABlockLdsExtraM
-        S<4, 16, 4>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
-        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
-        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
-        2,           // BBlockTransferSrcVectorDim
-        2,           // BBlockTransferSrcScalarPerVector
-        2,           // BBlockTransferDstScalarPerVector_BK1
-        1,           // BBlockLdsExtraN
-        4,           // CShuffleMXdlPerWavePerShuffle
-        1,           // CShuffleNXdlPerWavePerShuffle
-        S<1, 256, 1, 1>,
-        1>;
-#endif

 int main(int argc, char* argv[])
 {

@@ -185,6 +132,7 @@ int main(int argc, char* argv[])
         using InLayout   = ctc::G_NW_C;
         using WeiLayout  = ctc::G_K_X_C;
         using BiasLayout = ctc::G_NW_K;
+        using ResidualLayout = ctc::G_NW_K;
         using OutLayout  = ctc::G_NW_K;

         const auto in_g_n_c_wis_desc = HostTensorDescriptor(

@@ -214,6 +162,15 @@ int main(int argc, char* argv[])
                 0 // x
             });

+        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_, // g
+                0,             // k
+                1,             // c
+                0              // x
+            });
+
         const auto out_g_n_k_wos_desc = HostTensorDescriptor(
             {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
             {

@@ -223,15 +180,20 @@ int main(int argc, char* argv[])
                 conv_param.G_ * conv_param.K_ // wo
             });

-        return run_grouped_conv_fwd_bias<
-            1,
-            InDataType,
-            WeiDataType,
-            OutDataType,
-            InElementOp,
-            WeiElementOp,
-            OutElementOp,
-            DeviceGroupedConvNDFwdInstance<1, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+        return run_grouped_conv_fwd_bias_relu_add<1,
+                                                  InDataType,
+                                                  WeiDataType,
+                                                  CShuffleDataType,
+                                                  OutDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  DeviceGroupedConvNDFwdInstance<1,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
             do_verification,
             init_method,
             time_kernel,

@@ -239,6 +201,7 @@ int main(int argc, char* argv[])
             in_g_n_c_wis_desc,
             wei_g_k_c_xs_desc,
             bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
             out_g_n_k_wos_desc,
             in_element_op,
             wei_element_op,

@@ -249,6 +212,7 @@ int main(int argc, char* argv[])
         using InLayout   = ctc::G_NHW_C;
         using WeiLayout  = ctc::G_K_YX_C;
         using BiasLayout = ctc::G_NHW_K;
+        using ResidualLayout = ctc::G_NHW_K;
         using OutLayout  = ctc::G_NHW_K;

         const auto in_g_n_c_wis_desc = HostTensorDescriptor(

@@ -296,6 +260,20 @@ int main(int argc, char* argv[])
                 0 // wo
             });

+        const auto residual_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // ho
+                                     0              // wo
+                                 });
+
         const auto out_g_n_k_wos_desc = HostTensorDescriptor(
             {conv_param.G_,
              conv_param.N_,

@@ -311,15 +289,20 @@ int main(int argc, char* argv[])
                 conv_param.G_ * conv_param.K_ // wo
             });

-        return run_grouped_conv_fwd_bias<
-            2,
-            InDataType,
-            WeiDataType,
-            OutDataType,
-            InElementOp,
-            WeiElementOp,
-            OutElementOp,
-            DeviceGroupedConvNDFwdInstance<2, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+        return run_grouped_conv_fwd_bias_relu_add<2,
+                                                  InDataType,
+                                                  WeiDataType,
+                                                  CShuffleDataType,
+                                                  OutDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  DeviceGroupedConvNDFwdInstance<2,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
             do_verification,
             init_method,
             time_kernel,

@@ -327,6 +310,7 @@ int main(int argc, char* argv[])
             in_g_n_c_wis_desc,
             wei_g_k_c_xs_desc,
             bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
             out_g_n_k_wos_desc,
             in_element_op,
             wei_element_op,

@@ -337,6 +321,7 @@ int main(int argc, char* argv[])
         using InLayout   = ctc::G_NDHW_C;
         using WeiLayout  = ctc::G_K_ZYX_C;
         using BiasLayout = ctc::G_NDHW_K;
+        using ResidualLayout = ctc::G_NDHW_K;
         using OutLayout  = ctc::G_NDHW_K;

         const auto in_g_n_c_wis_desc = HostTensorDescriptor(

@@ -393,6 +378,22 @@ int main(int argc, char* argv[])
                 0 // x
             });

+        const auto residual_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1],
+                                  conv_param.output_spatial_lengths_[2]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // z
+                                     0,             // y
+                                     0              // x
+                                 });
+
         const auto out_g_n_k_wos_desc = HostTensorDescriptor(
             {conv_param.G_,
              conv_param.N_,

@@ -411,15 +412,20 @@ int main(int argc, char* argv[])
                 conv_param.G_ * conv_param.K_ // wo
             });

-        return run_grouped_conv_fwd_bias<
-            3,
-            InDataType,
-            WeiDataType,
-            OutDataType,
-            InElementOp,
-            WeiElementOp,
-            OutElementOp,
-            DeviceGroupedConvNDFwdInstance<3, InLayout, WeiLayout, BiasLayout, OutLayout>>(
+        return run_grouped_conv_fwd_bias_relu_add<3,
+                                                  InDataType,
+                                                  WeiDataType,
+                                                  CShuffleDataType,
+                                                  OutDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  DeviceGroupedConvNDFwdInstance<3,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
             do_verification,
             init_method,
             time_kernel,

@@ -427,6 +433,7 @@ int main(int argc, char* argv[])
             in_g_n_c_wis_desc,
             wei_g_k_c_xs_desc,
             bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
             out_g_n_k_wos_desc,
             in_element_op,
             wei_element_op,

...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = float;
using WeiDataType = float;
using AccDataType = float;
using CShuffleDataType = float;
using BiasDataType = float;
using ResidualDataType = float;
using OutDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
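// AddReluAdd implements the fused epilogue out = ReLU(conv + bias) + residual,
// applied elementwise to every convolution output value.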
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType, ResidualDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
16, // KPerBlock
4, // AK1
4, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
4, // ABlockTransferSrcScalarPerVector
4, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
4, // BBlockTransferSrcScalarPerVector
4, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 16, 1, 16>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
4>; // CDEBlockTransferScalarPerVector_NPerBlock
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
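// The numbers above follow from the group split and the usual output-size
// formula: per-group C = 384 / G = 192, per-group K = 512 / G = 256, and each
// output spatial length is
//   Wo = (Wi + pad_left + pad_right - dilation * (X - 1) - 1) / stride + 1
//      = (71 + 1 + 1 - 1 * (3 - 1) - 1) / 2 + 1 = 36.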
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_NW_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
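// Note on the descriptors below: the input and output buffers are physically
// packed as N x Wi x G x C and N x Wo x G x K (channel fastest); the strides
// express that memory through the [G, N, C, Wi] / [G, N, K, Wo] views that the
// G_NW_C / G_NW_K layout tags expect.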
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
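// bias and residual hold one value per (g, k) channel; the zero strides
// along n and wo broadcast that value across the batch and output width.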
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_NHW_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_NDHW_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using AccDataType = int32_t;
using CShuffleDataType = int8_t;
using BiasDataType = int8_t;
using ResidualDataType = int8_t;
using OutDataType = int8_t;
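// int8 inputs and weights accumulate in int32 (AccDataType); the shuffle
// stage and the fused outputs stay in int8.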
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename BiasLayout,
typename ResidualLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<BiasLayout, ResidualLayout>,
OutLayout,
InDataType,
WeiDataType,
AccDataType,
CShuffleDataType,
ck::Tuple<BiasDataType, ResidualDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
1, // NumGemmKPrefetchStage
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
64, // KPerBlock
16, // AK1
16, // BK1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
16, // ABlockTransferSrcScalarPerVector
16, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
16, // BBlockTransferSrcScalarPerVector
16, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMXdlPerWavePerShuffle
1, // CShuffleNXdlPerWavePerShuffle
S<1, 64, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
16>; // CDEBlockTransferScalarPerVector_NPerBlock
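// Compared with the fp32 variant, this instance keeps vector loads at
// 16 bytes (16 x int8 vs 4 x float per access) and raises KPerBlock from
// 16 to 64, so each block tile still covers the same number of bytes along K.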
int main(int argc, char* argv[])
{
namespace ctc = ck::tensor_layout::convolution;
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// conventional group conv definition
// G = 2
// [N, C, Hi, Wi] = [128, 384, 71, 71]
// [K, C, Y, X] = [512, 192, 3, 3]
// [N, K, Ho, Wo] = [128, 512, 36, 36]
// CK group conv definition
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
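// Same default problem as the fp32 example above; only the data types and
// the block-tile/vector parameters differ.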
ck::utils::conv::ConvParam conv_param{
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(conv_param.num_dim_spatial_ == 1)
{
using InLayout = ctc::G_NW_C;
using WeiLayout = ctc::G_K_X_C;
using BiasLayout = ctc::G_NW_K;
using ResidualLayout = ctc::G_NW_K;
using OutLayout = ctc::G_NW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
1, // c
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
0, // n
1, // k
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<1,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<1,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 2)
{
using InLayout = ctc::G_NHW_C;
using WeiLayout = ctc::G_K_YX_C;
using BiasLayout = ctc::G_NHW_K;
using ResidualLayout = ctc::G_NHW_K;
using OutLayout = ctc::G_NHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
0, // n
1, // k
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<2,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<2,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
else if(conv_param.num_dim_spatial_ == 3)
{
using InLayout = ctc::G_NDHW_C;
using WeiLayout = ctc::G_K_ZYX_C;
using BiasLayout = ctc::G_NDHW_K;
using ResidualLayout = ctc::G_NDHW_K;
using OutLayout = ctc::G_NDHW_K;
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.C_,
conv_param.input_spatial_lengths_[0],
conv_param.input_spatial_lengths_[1],
conv_param.input_spatial_lengths_[2]},
{
conv_param.C_, // g
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
1, // c
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
conv_param.G_ * conv_param.C_, // di
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
conv_param.G_ * conv_param.C_ // wi
});
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.K_,
conv_param.C_,
conv_param.filter_spatial_lengths_[0],
conv_param.filter_spatial_lengths_[1],
conv_param.filter_spatial_lengths_[2]},
{
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // g
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
1, // c
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
conv_param.C_, // z
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
conv_param.C_ // x
});
const auto bias_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto residual_g_n_k_wos_desc =
HostTensorDescriptor({conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
0, // n
1, // k
0, // do
0, // ho
0 // wo
});
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
{conv_param.G_,
conv_param.N_,
conv_param.K_,
conv_param.output_spatial_lengths_[0],
conv_param.output_spatial_lengths_[1],
conv_param.output_spatial_lengths_[2]},
{
conv_param.K_, // g
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
1, // k
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
conv_param.G_ * conv_param.K_, // do
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
conv_param.G_ * conv_param.K_ // wo
});
return run_grouped_conv_fwd_bias_relu_add<3,
InDataType,
WeiDataType,
CShuffleDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<3,
InLayout,
WeiLayout,
BiasLayout,
ResidualLayout,
OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
bias_g_n_k_wos_desc,
residual_g_n_k_wos_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
}
return 0;
}
# TODO: add example batched_gemm_gemm_xdl_fp16
add_example_executable(example_batched_gemm_softmax_gemm_xdl_fp16 batched_gemm_softmax_gemm_xdl_fp16.cpp)