diff --git a/3rdparty/composable_kernel/.clang-format b/3rdparty/composable_kernel/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..22f26749668ad3dfb79cc76151847663d11da7d0
--- /dev/null
+++ b/3rdparty/composable_kernel/.clang-format
@@ -0,0 +1,90 @@
+---
+Language: Cpp
+AccessModifierOffset: 0
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: true
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+ AfterClass: true
+ AfterControlStatement: true
+ AfterEnum: true
+ AfterFunction: true
+ AfterNamespace: false
+ AfterObjCDeclaration: true
+ AfterStruct: true
+ AfterUnion: true
+ BeforeCatch: true
+ BeforeElse: true
+ IndentBraces: false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit: 100
+CommentPragmas: '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IncludeCategories:
+ - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
+ Priority: 2
+ - Regex: '^(<|"(gtest|isl|json)/)'
+ Priority: 3
+ - Regex: '.*'
+ Priority: 1
+IndentCaseLabels: false
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: false
+SpaceAfterCStyleCast: false
+# SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: Never
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+TabWidth: 8
+UseTab: Never
+...
+
diff --git a/3rdparty/composable_kernel/.clang-tidy b/3rdparty/composable_kernel/.clang-tidy
new file mode 100644
index 0000000000000000000000000000000000000000..5c2b78168747787b506b0cd6e72af595eb270670
--- /dev/null
+++ b/3rdparty/composable_kernel/.clang-tidy
@@ -0,0 +1,3 @@
+CheckOptions:
+ - key: bugprone-reserved-identifier.AllowedIdentifiers
+ value: '__HIP_PLATFORM_HCC__;__HIP_ROCclr__'
diff --git a/3rdparty/composable_kernel/.gitignore b/3rdparty/composable_kernel/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..71059ec4d948ed241b76ea7ebf9c79136cfcbf74
--- /dev/null
+++ b/3rdparty/composable_kernel/.gitignore
@@ -0,0 +1,49 @@
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+*.ipch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# vim tags
+tags
+.tags
+.*.swp
+
+# Editors
+.vscode
+
+# build-in-source directory
+build*
+
+# emacs temporary/backup files
+.\#*
+\#*\#
+*~
+
+# GDB temporary files
+.gdb_history
+install.dir*
diff --git a/3rdparty/composable_kernel/CITATION.cff b/3rdparty/composable_kernel/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..d35fe9e5870f0a538c1775efd239dc729e11ab88
--- /dev/null
+++ b/3rdparty/composable_kernel/CITATION.cff
@@ -0,0 +1,67 @@
+cff-version: 1.2.0
+title: Composable Kernel
+message: If you use this software, please cite using the following metadata.
+type: software
+authors:
+ - given-names: Chao
+ family-names: Liu
+ email: chao.liu2@amd.com
+ affiliation: AMD
+ - given-names: Jing
+ family-names: Zhang
+ email: jing.zhang3@amd.com
+ affiliation: AMD
+ - given-names: Letao
+ family-names: Qin
+ email: letao.qin@amd.com
+ affiliation: AMD
+ - given-names: Qianfeng
+ family-names: Zhang
+ email: qianfeng.zhang@amd.com
+ affiliation: AMD
+ - given-names: Liang
+ family-names: Huang
+ email: carlus.huang@amd.com
+ affiliation: AMD
+ - given-names: Shaojie
+ family-names: Wang
+ email: shaojie.wang@amd.com
+ affiliation: AMD
+ - given-names: Anthony
+ family-names: Chang
+ email: antc@amd.com
+ affiliation: AMD
+ - given-names: Chunyu
+ family-names: Lai
+ email: chunyu.lai@amd.com
+ affiliation: AMD
+ - given-names: Illia
+ family-names: Silin
+ email: illia.silin@amd.com
+ affiliation: AMD
+ - given-names: Adam
+ family-names: Osewski
+ email: adam.osewski@amd.com
+ affiliation: AMD
+ - given-names: Poyen
+ family-names: Chen
+ email: poyen.chen@amd.com
+ affiliation: AMD
+ - given-names: Rosty
+ family-names: Geyyer
+ email: rosty.geyyer@amd.com
+ affiliation: AMD
+ - given-names: Hanwen
+ family-names: Chen
+ - given-names: Tejash
+ family-names: Shah
+ - given-names: Xiaoyan
+ family-names: Zhou
+ - given-names: Jianfeng
+ family-names: Yan
+repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel'
+abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel programming languages, like HIP C++.
+keywords:
+ - 'CK, Composable Kernel, Tensor Coordinate Transformation'
+license: MIT
+license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
diff --git a/3rdparty/composable_kernel/CMakeLists.txt b/3rdparty/composable_kernel/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..53e58890b8bdc0c96a2f1b801a40fc44d735b73a
--- /dev/null
+++ b/3rdparty/composable_kernel/CMakeLists.txt
@@ -0,0 +1,311 @@
+cmake_minimum_required(VERSION 3.14)
+
+# Check support for CUDA/HIP in CMake
+project(composable_kernel)
+
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+
+enable_testing()
+set(ROCM_PATH $ENV{ROCM_PATH})
+set(ROCM_SYMLINK_LIBS OFF)
+find_package(ROCM REQUIRED PATHS ${ROCM_PATH})
+
+include(ROCMInstallTargets)
+include(ROCMPackageConfigHelpers)
+include(ROCMSetupVersion)
+include(ROCMInstallSymlinks)
+include(ROCMCreatePackage)
+include(CheckCXXCompilerFlag)
+
+rocm_setup_version(VERSION 0.2.0)
+include(TargetFlags)
+list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip ${ROCM_PATH} ${ROCM_PATH}/llvm ${ROCM_PATH}/hip)
+
+option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
+
+if(USE_BITINT_EXTENSION_INT4)
+ add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
+ add_compile_options(-Wno-bit-int-extension)
+ message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
+endif()
+
+## Threads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
+link_libraries(Threads::Threads)
+
+## C++
+enable_language(CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+
+## OpenMP
+if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ # workaround issue hipcc in rocm3.5 cannot find openmp
+ set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
+ set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
+ set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
+ set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+ set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+ set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+else()
+ find_package(OpenMP REQUIRED)
+endif()
+
+message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
+message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
+message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
+message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
+
+link_libraries(${OpenMP_gomp_LIBRARY})
+link_libraries(${OpenMP_pthread_LIBRARY})
+
+## HIP
+find_package(HIP REQUIRED)
+# Override HIP version in config.h, if necessary.
+# The variables set by find_package() can't be overwritten,
+# therefore let's use intermediate variables.
+set(CK_HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}")
+set(CK_HIP_VERSION_MINOR "${HIP_VERSION_MINOR}")
+set(CK_HIP_VERSION_PATCH "${HIP_VERSION_PATCH}")
+if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR )
+ set(CK_HIP_VERSION_MAJOR "${CK_OVERRIDE_HIP_VERSION_MAJOR}")
+    message(STATUS "CK_HIP_VERSION_MAJOR overridden with ${CK_OVERRIDE_HIP_VERSION_MAJOR}")
+endif()
+if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR )
+ set(CK_HIP_VERSION_MINOR "${CK_OVERRIDE_HIP_VERSION_MINOR}")
+    message(STATUS "CK_HIP_VERSION_MINOR overridden with ${CK_OVERRIDE_HIP_VERSION_MINOR}")
+endif()
+if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
+ set(CK_HIP_VERSION_PATCH "${CK_OVERRIDE_HIP_VERSION_PATCH}")
+    message(STATUS "CK_HIP_VERSION_PATCH overridden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
+endif()
+message(STATUS "Build with HIP ${HIP_VERSION}")
+link_libraries(hip::device)
+add_compile_definitions(__HIP_PLATFORM_HCC__=1)
+
+## tidy
+include(EnableCompilerWarnings)
+set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
+if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
+ set(CK_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter)
+# Enable tidy on hip
+elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU")
+ set(CK_TIDY_ERRORS ALL)
+endif()
+
+
+include(ClangTidy)
+enable_clang_tidy(
+ CHECKS
+ *
+ -abseil-*
+ -android-cloexec-fopen
+ # Yea we shouldn't be using rand()
+ -cert-msc30-c
+ -bugprone-exception-escape
+ -bugprone-macro-parentheses
+ -cert-env33-c
+ -cert-msc32-c
+ -cert-msc50-cpp
+ -cert-msc51-cpp
+ -cert-dcl37-c
+ -cert-dcl51-cpp
+ -clang-analyzer-alpha.core.CastToStruct
+ -clang-analyzer-optin.performance.Padding
+ -clang-diagnostic-deprecated-declarations
+ -clang-diagnostic-extern-c-compat
+ -clang-diagnostic-unused-command-line-argument
+ -cppcoreguidelines-avoid-c-arrays
+ -cppcoreguidelines-avoid-magic-numbers
+ -cppcoreguidelines-explicit-virtual-functions
+ -cppcoreguidelines-init-variables
+ -cppcoreguidelines-macro-usage
+ -cppcoreguidelines-non-private-member-variables-in-classes
+ -cppcoreguidelines-pro-bounds-array-to-pointer-decay
+ -cppcoreguidelines-pro-bounds-constant-array-index
+ -cppcoreguidelines-pro-bounds-pointer-arithmetic
+ -cppcoreguidelines-pro-type-member-init
+ -cppcoreguidelines-pro-type-reinterpret-cast
+ -cppcoreguidelines-pro-type-union-access
+ -cppcoreguidelines-pro-type-vararg
+ -cppcoreguidelines-special-member-functions
+ -fuchsia-*
+ -google-explicit-constructor
+ -google-readability-braces-around-statements
+ -google-readability-todo
+ -google-runtime-int
+ -google-runtime-references
+ -hicpp-vararg
+ -hicpp-braces-around-statements
+ -hicpp-explicit-conversions
+ -hicpp-named-parameter
+ -hicpp-no-array-decay
+ # We really shouldn't use bitwise operators with signed integers, but
+ # opencl leaves us no choice
+ -hicpp-avoid-c-arrays
+ -hicpp-signed-bitwise
+ -hicpp-special-member-functions
+ -hicpp-uppercase-literal-suffix
+ -hicpp-use-auto
+ -hicpp-use-equals-default
+ -hicpp-use-override
+ -llvm-header-guard
+ -llvm-include-order
+ #-llvmlibc-*
+ -llvmlibc-restrict-system-libc-headers
+ -llvmlibc-callee-namespace
+ -llvmlibc-implementation-in-namespace
+ -llvm-else-after-return
+ -llvm-qualified-auto
+ -misc-misplaced-const
+ -misc-non-private-member-variables-in-classes
+ -misc-no-recursion
+ -modernize-avoid-bind
+ -modernize-avoid-c-arrays
+ -modernize-pass-by-value
+ -modernize-use-auto
+ -modernize-use-default-member-init
+ -modernize-use-equals-default
+ -modernize-use-trailing-return-type
+ -modernize-use-transparent-functors
+ -performance-unnecessary-value-param
+ -readability-braces-around-statements
+ -readability-else-after-return
+ # we are not ready to use it, but very useful
+ -readability-function-cognitive-complexity
+ -readability-isolate-declaration
+ -readability-magic-numbers
+ -readability-named-parameter
+ -readability-uppercase-literal-suffix
+ -readability-convert-member-functions-to-static
+ -readability-qualified-auto
+ -readability-redundant-string-init
+ # too many narrowing conversions in our code
+ -bugprone-narrowing-conversions
+ -cppcoreguidelines-narrowing-conversions
+ -altera-struct-pack-align
+ -cppcoreguidelines-prefer-member-initializer
+ ${CK_TIDY_CHECKS}
+ ${CK_TIDY_ERRORS}
+ HEADER_FILTER
+ "\.hpp$"
+ EXTRA_ARGS
+ -DCK_USE_CLANG_TIDY
+)
+
+include(CppCheck)
+enable_cppcheck(
+ CHECKS
+ warning
+ style
+ performance
+ portability
+ SUPPRESS
+ ConfigurationNotChecked
+ constStatement
+ duplicateCondition
+ noExplicitConstructor
+ passedByValue
+ preprocessorErrorDirective
+ shadowVariable
+ unusedFunction
+ unusedPrivateFunction
+ unusedStructMember
+ unmatchedSuppression
+ FORCE
+ SOURCES
+ library/src
+ INCLUDE
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${CMAKE_CURRENT_BINARY_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/library/include
+ DEFINE
+ CPPCHECK=1
+ __linux__=1
+)
+
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
+
+include_directories(BEFORE
+ ${PROJECT_SOURCE_DIR}/include
+ ${PROJECT_SOURCE_DIR}/library/include
+ ${HIP_INCLUDE_DIRS}
+)
+
+
+SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
+if(BUILD_DEV)
+ add_compile_options(-Werror)
+ add_compile_options(-Weverything)
+endif()
+message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
+
+file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
+file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
+set(CK_DEVICE_INSTANCES)
+FOREACH(subdir_path ${dir_list})
+ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
+ list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
+ ENDIF()
+ENDFOREACH()
+add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
+
+rocm_package_setup_component(tests
+ LIBRARY_NAME composablekernel
+ PACKAGE_NAME tests # Prevent -static suffix on package name
+)
+
+rocm_package_setup_component(examples
+ LIBRARY_NAME composablekernel
+ PACKAGE_NAME examples
+)
+
+rocm_package_setup_component(profiler
+ LIBRARY_NAME composablekernel
+ PACKAGE_NAME ckProfiler
+)
+
+add_subdirectory(library)
+add_subdirectory(example)
+add_subdirectory(test)
+add_subdirectory(profiler)
+
+#Create an interface target for the include only files and call it "composablekernels"
+include(CMakePackageConfigHelpers)
+
+set(version 1.0.0)
+write_basic_package_version_file(
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+ VERSION "${version}"
+ COMPATIBILITY AnyNewerVersion
+)
+
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+ NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+
+rocm_install(FILES
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+ "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
+
+set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
+set(CPACK_RPM_PACKAGE_LICENSE "MIT")
+
+rocm_create_package(
+ NAME composablekernel
+ DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
+    MAINTAINER "MIOpen Kernels Dev Team
+"
+ LDCONFIG
+ HEADER_ONLY
+)
diff --git a/3rdparty/composable_kernel/CONTRIBUTORS.md b/3rdparty/composable_kernel/CONTRIBUTORS.md
new file mode 100644
index 0000000000000000000000000000000000000000..8ccfe99c3cc73b643f8b92cb654005e54c0774bd
--- /dev/null
+++ b/3rdparty/composable_kernel/CONTRIBUTORS.md
@@ -0,0 +1,31 @@
+# Composable Kernel Developers and Contributors
+
+This is the list of developers and contributors to Composable Kernel library
+
+
+## Developers
+[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2022
+
+[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2022
+
+[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), 2022
+
+Hanwen Chang, 2019-2021,
+
+Tejash Shah, 2019-2020
+
+Xiaoyan Zhou, 2020
+
+[Jianfeng Yan](https://github.com/j4yan), 2021-2022
+
+
+## Product Manager
+[Jun Liu](https://github.com/junliume)
+
+
+## Contributors
+[Dan Yao](https://github.com/danyao12), [Guangzhao Lu](https://github.com/guangzlu), [Raman Jana](https://github.com/ramjana), [Jehandad Khan](https://github.com/JehandadKhan), [Wen-Heng (Jack) Chung](https://github.com/whchung)
+
+
+## Acknowledgement
+CK team works closely with Meta [AITemplate](https://github.com/facebookincubator/AITemplate) team ([Bing Xu](https://github.com/antinucleon), [Hao Lu](https://github.com/hlu1), [Ying Zhang](https://github.com/ipiszy), etc). Most of the lucrative graph optimization opportunities in ML models were identified by AITemplate team, and we also co-designed many high performance fused kernels for AMD GPUs. Without this collaboration, CK would not reach its current potential.
diff --git a/3rdparty/composable_kernel/Config.cmake.in b/3rdparty/composable_kernel/Config.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..02978cd4dd49f53188080f1a0404cf87bfd52d7b
--- /dev/null
+++ b/3rdparty/composable_kernel/Config.cmake.in
@@ -0,0 +1,11 @@
+@PACKAGE_INIT@
+
+set(_composable_kernel_supported_components device_operations utility)
+
+foreach(_comp ${composable_kernel_FIND_COMPONENTS})
+ if(NOT _comp IN_LIST _composable_kernel_supported_components)
+ set(composable_kernel_FOUND False)
+ set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}")
+ endif()
+ include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake")
+endforeach()
diff --git a/3rdparty/composable_kernel/Dockerfile b/3rdparty/composable_kernel/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..d024f966c57fac8ced14abbb00db7cc8f20c7d63
--- /dev/null
+++ b/3rdparty/composable_kernel/Dockerfile
@@ -0,0 +1,110 @@
+FROM ubuntu:20.04
+
+ARG ROCMVERSION=5.3
+ARG compiler_version="release"
+ARG compiler_commit=""
+
+RUN set -xe
+
+ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
+# Add rocm repository
+RUN apt-get update
+RUN apt-get install -y wget gnupg
+RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
+RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
+RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
+RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
+
+# Install dependencies
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+ apt-utils \
+ build-essential \
+ ccache \
+ cmake-data \
+ cmake \
+ curl \
+ git \
+ hip-rocclr \
+ jq \
+ libelf-dev \
+ libncurses5-dev \
+ libnuma-dev \
+ libpthread-stubs0-dev \
+ llvm-amdgpu \
+ pkg-config \
+ python \
+ python3 \
+ python-dev \
+ python3-dev \
+ python3-pip \
+ software-properties-common \
+ rocm-dev \
+ rocm-device-libs \
+ rocm-cmake \
+ vim \
+ zlib1g-dev \
+ openssh-server \
+ clang-format-10 \
+ kmod && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+# Setup ubsan environment to print stacktrace
+RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
+ENV UBSAN_OPTIONS=print_stacktrace=1
+
+# Install an init system
+RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
+RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb
+
+ARG PREFIX=/opt/rocm
+# Install packages for processing the performance results
+RUN pip3 install --upgrade pip
+RUN pip3 install sqlalchemy
+RUN pip3 install pymysql
+RUN pip3 install pandas
+RUN pip3 install setuptools-rust
+RUN pip3 install sshtunnel
+# Setup ubsan environment to print stacktrace
+ENV UBSAN_OPTIONS=print_stacktrace=1
+
+ENV LC_ALL=C.UTF-8
+ENV LANG=C.UTF-8
+RUN groupadd -f render
+
+# Install the new rocm-cmake version
+RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git && \
+ cd rocm-cmake && mkdir build && cd build && \
+ cmake .. && cmake --build . && cmake --build . --target install
+
+WORKDIR /
+
+ENV compiler_version=$compiler_version
+ENV compiler_commit=$compiler_commit
+RUN sh -c "echo compiler version = '$compiler_version'"
+RUN sh -c "echo compiler commit = '$compiler_commit'"
+
+RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ]; then \
+ sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/hip/bin/hipcc.pl && \
+ sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/bin/hipcc.pl; \
+ fi
+
+RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" = "" ]; then \
+ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
+ cd llvm-project && mkdir build && cd build && \
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
+ make -j 8 ; \
+ else echo "using the release compiler"; \
+ fi
+
+RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" != "" ]; then \
+ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
+ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
+ make -j 8 ; \
+ else echo "using the release compiler"; \
+ fi
+
+
+#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
+#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
diff --git a/3rdparty/composable_kernel/Jenkinsfile b/3rdparty/composable_kernel/Jenkinsfile
new file mode 100644
index 0000000000000000000000000000000000000000..7b2e57c1403df3cd4574bf127b96163f1b736d17
--- /dev/null
+++ b/3rdparty/composable_kernel/Jenkinsfile
@@ -0,0 +1,705 @@
+def rocmnode(name) {
+ return 'rocmtest && miopen && ' + name
+}
+
+def show_node_info() {
+ sh """
+ echo "NODE_NAME = \$NODE_NAME"
+ lsb_release -sd
+ uname -r
+ ls /opt/ -la
+ """
+}
+
+def runShell(String command){
+ def responseCode = sh returnStatus: true, script: "${command} > tmp.txt"
+ def output = readFile(file: "tmp.txt")
+ echo "tmp.txt contents: $output"
+ return (output != "")
+}
+
+def getDockerImageName(){
+ def img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+ return img
+}
+
+def check_host() {
+ if ("${env.CK_CCACHE}" != "null"){
+ def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}"
+ echo "ccache server: ${CCACHE_SERVER}"
+ sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? > tmp.txt'''
+ def output = readFile(file: "tmp.txt")
+ echo "tmp.txt contents: \$output"
+ return (output != "0")
+ }
+ else{
+ return 1
+ }
+}
+
+def build_compiler(){
+ def compiler
+ if (params.BUILD_COMPILER == "hipcc"){
+ compiler = '/opt/rocm/bin/hipcc'
+ }
+ else{
+ if (params.COMPILER_VERSION == "release"){
+ compiler = "/opt/rocm/llvm/bin/clang++"
+ }
+ else{
+ compiler = "/llvm-project/build/bin/clang++"
+ }
+ }
+ return compiler
+}
+
+def getDockerImage(Map conf=[:]){
+ env.DOCKER_BUILDKIT=1
+ def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
+ def no_cache = conf.get("no_cache", false)
+ def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
+ echo "ccache server: ${env.CK_CCACHE}"
+ if(env.CK_CCACHE)
+ {
+ if(check_host())
+ {
+ echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
+ }
+ else
+ {
+ echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
+ }
+ dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
+ env.CCACHE_DIR = """/tmp/ccache_store"""
+ env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
+ }
+ if(no_cache)
+ {
+ dockerArgs = dockerArgs + " --no-cache "
+ }
+ echo "Docker Args: ${dockerArgs}"
+ def image = getDockerImageName()
+ //Check if image exists
+ def retimage
+ try
+ {
+ echo "Pulling down image: ${image}"
+ retimage = docker.image("${image}")
+ retimage.pull()
+ }
+ catch(Exception ex)
+ {
+ error "Unable to locate image: ${image}"
+ }
+ return [retimage, image]
+}
+
+def buildDocker(install_prefix){
+ show_node_info()
+ env.DOCKER_BUILDKIT=1
+ checkout scm
+ def image_name = getDockerImageName()
+ echo "Building Docker for ${image_name}"
+ def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
+ echo "ccache server: ${env.CK_CCACHE}"
+ if(env.CK_CCACHE)
+ {
+ if(check_host())
+ {
+ echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}"
+ }
+ else
+ {
+ echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response"
+ }
+ dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' "
+ env.CCACHE_DIR = """/tmp/ccache_store"""
+ env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}"""
+ }
+
+ echo "Build Args: ${dockerArgs}"
+ try{
+ if(params.BUILD_DOCKER){
+ //force building the new docker if that parameter is true
+ echo "Building image: ${image_name}"
+ retimage = docker.build("${image_name}", dockerArgs + ' .')
+ retimage.push()
+ }
+ else{
+ echo "Checking for image: ${image_name}"
+ sh "docker manifest inspect --insecure ${image_name}"
+ echo "Image: ${image_name} found!! Skipping building image"
+ }
+ }
+ catch(Exception ex){
+ echo "Unable to locate image: ${image_name}. Building image now"
+ retimage = docker.build("${image_name}", dockerArgs + ' .')
+ retimage.push()
+ }
+}
+
+def cmake_build(Map conf=[:]){
+
+ def compiler = build_compiler()
+ def config_targets = conf.get("config_targets","check")
+ def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "")
+ def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","")
+ def prefixpath = conf.get("prefixpath","/opt/rocm")
+ def setup_args = conf.get("setup_args","")
+
+ if (prefixpath != "/usr/local"){
+ setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} "
+ }
+
+ def build_type_debug = (conf.get("build_type",'release') == 'debug')
+
+ //cmake_env can overwrite default CXX variables.
+ def cmake_envs = "CXX=${compiler} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","")
+
+ def package_build = (conf.get("package_build","") == "true")
+
+ if (package_build == true) {
+ config_targets = "package"
+ }
+
+ if(conf.get("build_install","") == "true")
+ {
+ config_targets = 'install ' + config_targets
+ setup_args = ' -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install' + setup_args
+ } else{
+ setup_args = ' -DBUILD_DEV=On' + setup_args
+ }
+
+ if(build_type_debug){
+ setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args
+ }else{
+ setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args
+ }
+ if(env.CK_CCACHE)
+ {
+ setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args
+ }
+ echo "ccache server: ${env.CK_CCACHE}"
+
+ def pre_setup_cmd = """
+ echo \$HSA_ENABLE_SDMA
+ ulimit -c unlimited
+ rm -rf build
+ mkdir build
+ rm -rf install
+ mkdir install
+ cd build
+ """
+ def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
+ // reduce parallelism when compiling, clang uses too much memory
+ def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}")
+ def execute_cmd = conf.get("execute_cmd", "")
+
+ def cmd = conf.get("cmd", """
+ ${pre_setup_cmd}
+ ${setup_cmd}
+ ${build_cmd}
+ ${execute_cmd}
+ """)
+
+ echo cmd
+ sh cmd
+
+ // Only archive from master or develop
+ if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "master")) {
+ archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
+ }
+}
+
+def buildHipClangJob(Map conf=[:]){
+ show_node_info()
+
+ env.HSA_ENABLE_SDMA=0
+ checkout scm
+
+ def image = getDockerImageName()
+ def prefixpath = conf.get("prefixpath", "/opt/rocm")
+
+ // Jenkins is complaining about the render group
+ def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+ if (conf.get("enforce_xnack_on", false)) {
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
+ }
+ def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
+ if (params.COMPILER_VERSION != "release"){
+ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+ }
+
+ def variant = env.STAGE_NAME
+
+ def retimage
+ (retimage, image) = getDockerImage(conf)
+
+ gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+ timeout(time: 5, unit: 'HOURS')
+ {
+ cmake_build(conf)
+ }
+ }
+ }
+ return retimage
+}
+
+def reboot(){
+ build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
+}
+
+def buildHipClangJobAndReboot(Map conf=[:]){
+ try{
+ buildHipClangJob(conf)
+ }
+ catch(e){
+ echo "throwing error exception for the stage"
+ echo 'Exception occurred: ' + e.toString()
+ throw e
+ }
+ finally{
+ if (!conf.get("no_reboot", false)) {
+ reboot()
+ }
+ }
+}
+
+def runCKProfiler(Map conf=[:]){
+ show_node_info()
+
+ env.HSA_ENABLE_SDMA=0
+ checkout scm
+
+ def image = getDockerImageName()
+ def prefixpath = conf.get("prefixpath", "/opt/rocm")
+
+ // Jenkins is complaining about the render group
+ def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+ if (conf.get("enforce_xnack_on", false)) {
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
+ }
+ def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
+ if (params.COMPILER_VERSION != "release"){
+ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+ }
+
+ def variant = env.STAGE_NAME
+ def retimage
+
+ gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+ try {
+ (retimage, image) = getDockerImage(conf)
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES'){
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+ if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+ throw new Exception ("GPU not found")
+ }
+ else{
+ echo "GPU is OK"
+ }
+ }
+ }
+ }
+ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+ echo "The job was cancelled or aborted"
+ throw e
+ }
+ catch(Exception ex) {
+ retimage = docker.build("${image}", dockerArgs + " --no-cache .")
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES'){
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+ if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+ throw new Exception ("GPU not found")
+ }
+ else{
+ echo "GPU is OK"
+ }
+ }
+ }
+ }
+
+ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+ timeout(time: 24, unit: 'HOURS')
+ {
+ //cmake_build(conf)
+ //instead of building, just unstash the ckProfiler and install it
+ sh """
+ rm -rf build
+ mkdir build
+ """
+ dir("build"){
+ unstash 'ckProfiler.tar.gz'
+ sh 'tar -xvf ckProfiler.tar.gz'
+ }
+
+ dir("script"){
+ if (params.RUN_FULL_QA){
+ sh "./run_full_performance_tests.sh 1 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
+ archiveArtifacts "perf_gemm.log"
+ archiveArtifacts "perf_resnet50_N256.log"
+ archiveArtifacts "perf_resnet50_N4.log"
+ archiveArtifacts "perf_batched_gemm.log"
+ archiveArtifacts "perf_grouped_gemm.log"
+ archiveArtifacts "perf_conv_fwd.log"
+ archiveArtifacts "perf_conv_bwd_data.log"
+ archiveArtifacts "perf_gemm_bilinear.log"
+ archiveArtifacts "perf_reduction.log"
+ archiveArtifacts "perf_splitK_gemm_verify.log"
+ archiveArtifacts "perf_splitK_gemm.log"
+ archiveArtifacts "perf_onnx_gemm.log"
+ // stash perf files to master
+ stash name: "perf_gemm.log"
+ stash name: "perf_resnet50_N256.log"
+ stash name: "perf_resnet50_N4.log"
+ stash name: "perf_batched_gemm.log"
+ stash name: "perf_grouped_gemm.log"
+ stash name: "perf_conv_fwd.log"
+ stash name: "perf_conv_bwd_data.log"
+ stash name: "perf_gemm_bilinear.log"
+ stash name: "perf_reduction.log"
+ stash name: "perf_splitK_gemm.log"
+ stash name: "perf_onnx_gemm.log"
+ //we will process results on the master node
+ }
+ else{
+ sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}"
+ archiveArtifacts "perf_gemm.log"
+ archiveArtifacts "perf_resnet50_N256.log"
+ archiveArtifacts "perf_resnet50_N4.log"
+ // stash perf files to master
+ stash name: "perf_gemm.log"
+ stash name: "perf_resnet50_N256.log"
+ stash name: "perf_resnet50_N4.log"
+ //we will process the results on the master node
+ }
+ }
+ }
+ }
+ }
+ return retimage
+}
+
+def runPerfTest(Map conf=[:]){
+ try{
+ runCKProfiler(conf)
+ }
+ catch(e){
+ echo "throwing error exception in performance tests"
+ echo 'Exception occurred: ' + e.toString()
+ throw e
+ }
+ finally{
+ if (!conf.get("no_reboot", false)) {
+ reboot()
+ }
+ }
+}
+
+def Build_CK(Map conf=[:]){
+ show_node_info()
+
+ env.HSA_ENABLE_SDMA=0
+ checkout scm
+
+ def image = getDockerImageName()
+ def prefixpath = conf.get("prefixpath", "/opt/rocm")
+
+ // Jenkins is complaining about the render group
+ def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+ if (conf.get("enforce_xnack_on", false)) {
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
+ }
+ def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
+ if (params.COMPILER_VERSION != "release"){
+ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
+ }
+
+ def variant = env.STAGE_NAME
+ def retimage
+
+ gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+ try {
+ (retimage, image) = getDockerImage(conf)
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES'){
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+ if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+ throw new Exception ("GPU not found")
+ }
+ else{
+ echo "GPU is OK"
+ }
+ }
+ }
+ }
+ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+ echo "The job was cancelled or aborted"
+ throw e
+ }
+ catch(Exception ex) {
+ retimage = docker.build("${image}", dockerArgs + " --no-cache .")
+ withDockerContainer(image: image, args: dockerOpts) {
+ timeout(time: 5, unit: 'MINUTES'){
+ sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log'
+ if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+ throw new Exception ("GPU not found")
+ }
+ else{
+ echo "GPU is OK"
+ }
+ }
+ }
+ }
+ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+ timeout(time: 24, unit: 'HOURS')
+ {
+ cmake_build(conf)
+ dir("build"){
+ //run tests and examples
+ sh 'make -j check'
+ //we only need the ckProfiler to run the performance tests, so we pack and stash it
+ sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler'
+ stash "ckProfiler.tar.gz"
+ }
+ }
+ }
+ }
+ return retimage
+}
+
+def Build_CK_and_Reboot(Map conf=[:]){
+ try{
+ Build_CK(conf)
+ }
+ catch(e){
+ echo "throwing error exception while building CK"
+ echo 'Exception occurred: ' + e.toString()
+ throw e
+ }
+ finally{
+ if (!conf.get("no_reboot", false)) {
+ reboot()
+ }
+ }
+}
+
+def process_results(Map conf=[:]){
+ env.HSA_ENABLE_SDMA=0
+ checkout scm
+ def image = getDockerImageName()
+ def prefixpath = "/opt/rocm"
+
+ // Jenkins is complaining about the render group
+ def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+ if (conf.get("enforce_xnack_on", false)) {
+ dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
+ }
+
+ def variant = env.STAGE_NAME
+ def retimage
+
+ gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
+ try {
+ (retimage, image) = getDockerImage(conf)
+ }
+ catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+ echo "The job was cancelled or aborted"
+ throw e
+ }
+ }
+
+ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+ timeout(time: 1, unit: 'HOURS'){
+ try{
+ dir("script"){
+ if (params.RUN_FULL_QA){
+ // unstash perf files to master
+ unstash "perf_gemm.log"
+ unstash "perf_resnet50_N256.log"
+ unstash "perf_resnet50_N4.log"
+ unstash "perf_batched_gemm.log"
+ unstash "perf_grouped_gemm.log"
+ unstash "perf_conv_fwd.log"
+ unstash "perf_conv_bwd_data.log"
+ unstash "perf_gemm_bilinear.log"
+ unstash "perf_reduction.log"
+ unstash "perf_splitK_gemm.log"
+ unstash "perf_onnx_gemm.log"
+ sh "./process_qa_data.sh"
+ }
+ else{
+ // unstash perf files to master
+ unstash "perf_gemm.log"
+ unstash "perf_resnet50_N256.log"
+ unstash "perf_resnet50_N4.log"
+ sh "./process_perf_data.sh"
+ }
+ }
+ }
+ catch(e){
+ echo "throwing error exception while processing performance test results"
+ echo 'Exception occurred: ' + e.toString()
+ throw e
+ }
+ }
+ }
+}
+
+//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;COMPILER_VERSION=release
+ 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open''' : ""
+
+pipeline {
+ agent none
+ triggers {
+ parameterizedCron(CRON_SETTINGS)
+ }
+ options {
+ parallelsAlwaysFailFast()
+ }
+ parameters {
+ booleanParam(
+ name: "BUILD_DOCKER",
+ defaultValue: false,
+ description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
+ string(
+ name: 'ROCMVERSION',
+ defaultValue: '5.3',
+ description: 'Specify which ROCM version to use: 5.2.3, or 5.3 (default), etc.')
+ string(
+ name: 'COMPILER_VERSION',
+ defaultValue: 'release',
+ description: 'Specify which version of compiler to use: ck-9110, release (default), or amd-stg-open.')
+ string(
+ name: 'COMPILER_COMMIT',
+ defaultValue: '',
+ description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit (default), or use 8a82e4eb7ba28521ba9a9424a0315a8a16590424 commit of amd-stg-open branch.')
+ string(
+ name: 'BUILD_COMPILER',
+ defaultValue: 'hipcc',
+ description: 'Specify whether to build CK with hipcc (default) or with clang.')
+ booleanParam(
+ name: "RUN_FULL_QA",
+ defaultValue: false,
+ description: "Select whether to run small set of performance tests (default) or full QA")
+ }
+ environment{
+ dbuser = "${dbuser}"
+ dbpassword = "${dbpassword}"
+ dbsship = "${dbsship}"
+ dbsshport = "${dbsshport}"
+ dbsshuser = "${dbsshuser}"
+ dbsshpassword = "${dbsshpassword}"
+ status_wrapper_creds = "${status_wrapper_creds}"
+ gerrit_cred="${gerrit_cred}"
+ DOCKER_BUILDKIT = "1"
+ }
+ stages{
+ stage("Build Docker"){
+ //when {
+ // beforeAgent true
+ // expression { params.BUILD_DOCKER.toBoolean() }
+ //}
+ parallel{
+ stage('Docker /opt/rocm'){
+ agent{ label rocmnode("nogpu") }
+ steps{
+ buildDocker('/opt/rocm')
+ }
+ }
+ }
+ }
+ stage("Static checks") {
+ parallel{
+ stage('Clang Format') {
+ agent{ label rocmnode("nogpu") }
+ environment{
+ execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \
+ -o -not -path \'*.git*\' -iname \'*.hpp\' \
+ -o -not -path \'*.git*\' -iname \'*.cpp\' \
+ -o -iname \'*.h.in\' \
+ -o -iname \'*.hpp.in\' \
+ -o -iname \'*.cpp.in\' \
+ -o -iname \'*.cl\' \
+ | grep -v 'build/' \
+ | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-10 -style=file {} | diff - {}\'"
+ }
+ steps{
+ buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
+ }
+ }
+ }
+ }
+
+ stage("Build CK and run Tests")
+ {
+ parallel
+ {
+ stage("Build CK and run Tests")
+ {
+ agent{ label rocmnode("gfx908 || gfx90a") }
+ environment{
+ setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" """ : """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 " """ }"
+ execute_args = "${params.COMPILER_VERSION == "ck-9110" ? """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908,gfx90a" -DCMAKE_CXX_FLAGS="-O3" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ }"
+ }
+ steps{
+ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+ }
+ }
+ }
+ }
+
+ stage("Performance Tests")
+ {
+ parallel
+ {
+ stage("Run ckProfiler: gfx908 or gfx90a")
+ {
+ when {
+ beforeAgent true
+ expression { !params.RUN_FULL_QA.toBoolean() }
+ }
+ options { retry(2) }
+ agent{ label rocmnode("gfx908 || gfx90a")}
+ environment{
+ setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """ : """ -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " -DBUILD_DEV=On """}"
+ }
+ steps{
+ runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
+ }
+ }
+ stage("Run ckProfiler: gfx90a")
+ {
+ when {
+ beforeAgent true
+ expression { params.RUN_FULL_QA.toBoolean() }
+ }
+ options { retry(2) }
+ agent{ label rocmnode("gfx90a")}
+ environment{
+ setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """ : """ -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " -DBUILD_DEV=On """}"
+ }
+ steps{
+ runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
+ }
+ }
+ }
+ }
+ stage("Process Performance Test Results")
+ {
+ parallel
+ {
+ stage("Process results"){
+ agent { label 'mici' }
+ steps{
+ process_results()
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/3rdparty/composable_kernel/LICENSE b/3rdparty/composable_kernel/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..2fe9a8455efaeda2eab474b2aa038ec2d9e76841
--- /dev/null
+++ b/3rdparty/composable_kernel/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2018- , Advanced Micro Devices, Inc. (Chao Liu, Jing Zhang)
+Copyright (c) 2019- , Advanced Micro Devices, Inc. (Letao Qin, Qianfeng Zhang, Liang Huang, Shaojie Wang)
+Copyright (c) 2022- , Advanced Micro Devices, Inc. (Anthony Chang, Chunyu Lai, Illia Silin, Adam Osewski, Poyen Chen, Jehandad Khan)
+Copyright (c) 2019-2021, Advanced Micro Devices, Inc. (Hanwen Chang)
+Copyright (c) 2019-2020, Advanced Micro Devices, Inc. (Tejash Shah)
+Copyright (c) 2020 , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
+Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
+
+SPDX-License-Identifier: MIT
+Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/3rdparty/composable_kernel/README.md b/3rdparty/composable_kernel/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..129996e268033831a2d29cc96d16edaed2805caa
--- /dev/null
+++ b/3rdparty/composable_kernel/README.md
@@ -0,0 +1,96 @@
+# Composable Kernel
+
+## Methodology
+Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
+
+CK utilizes two concepts to achieve performance portability and code maintainability:
+* A tile-based programming model
+* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
+
+
+
+## Code Structure
+Current CK library are structured into 4 layers:
+* "Templated Tile Operators" layer
+* "Templated Kernel and Invoker" layer
+* "Instantiated Kernel and Invoker" layer
+* "Client API" layer
+
+
+
+## Contributors
+The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
+
+## Citation
+If you use CK, please use following citations:
+* CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???)
+* [CITATION.cff](/CITATION.cff)
+
+## License
+CK is released under the MIT license. [License File](/LICENSE)
+
+
+# Build CK
+
+## Build docker image
+```bash
+DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
+```
+
+## Launch docker
+```bash
+docker run \
+-it \
+--privileged \
+--group-add sudo \
+-w /root/workspace \
+-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
+ck:latest \
+/bin/bash
+```
+
+## Build CK
+```bash
+mkdir build && cd build
+
+# Need to specify target ID, example below is for gfx906 and gfx926
+cmake \
+-D CMAKE_PREFIX_PATH=/opt/dtk-23.04 \
+-D CMAKE_CXX_COMPILER=/opt/dtk-23.04/bin/hipcc \
+-D CMAKE_CXX_FLAGS="-O3" \
+-D CMAKE_BUILD_TYPE=Release \
+-D GPU_TARGETS="gfx906;gfx926" \
+..
+```
+
+### Build examples and tests
+```bash
+ make -j examples tests
+ make test
+```
+
+Instructions for running each individual examples are under [example](/example)
+
+
+## Build ckProfiler
+```bash
+ make -j ckProfiler
+```
+Instructions for running ckProfiler are under [profiler](/profiler)
+
+## Install CK
+```bash
+make install
+```
+
+## Using CK as pre-built kernel library
+Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
+
+## Caveat
+### Kernel Timing and Verification
+CK's own kernel timer will warm up the kernel once, and then run it multiple times
+to get average kernel time. For some kernels that use atomic add, this will cause
+output buffer to be accumulated multiple times, causing verification failure.
+To work around it, do not use CK's own timer and do verification at the same time.
+CK's own timer and verification in each example and ckProfiler can be enabled or
+disabled from command line.
diff --git a/3rdparty/composable_kernel/client_example/01_gemm/CMakeLists.txt b/3rdparty/composable_kernel/client_example/01_gemm/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9e741192f90b8216e4b3abe32ae8971fb45ddfee
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/01_gemm/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_gemm gemm.cpp)
+target_link_libraries(client_gemm PRIVATE composable_kernel::device_operations)
diff --git a/3rdparty/composable_kernel/client_example/01_gemm/gemm.cpp b/3rdparty/composable_kernel/client_example/01_gemm/gemm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..668c1bf0041847f3e0a9842a693bb8c1c8e4811e
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/01_gemm/gemm.cpp
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+using ADataType = F32;
+using BDataType = F32;
+using CDataType = F32;
+
+using ALayout = Row;
+using BLayout = Row;
+using CLayout = Row;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideC = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 7)
+ {
+ M = std::stoi(argv[1]);
+ N = std::stoi(argv[2]);
+ K = std::stoi(argv[3]);
+
+ StrideA = std::stoi(argv[4]);
+ StrideB = std::stoi(argv[5]);
+ StrideC = std::stoi(argv[6]);
+ }
+ else
+ {
+ printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n");
+ exit(0);
+ }
+
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+            if(std::is_same<Layout, Row>::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
+    ADataType* in_data = new ADataType[K * N];
+    for(int i = 0; i < K * N; i++)
+    {
+        in_data[i] = static_cast<ADataType>(1);
+    }
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemm<ALayout,
+                                                              BLayout,
+                                                              CLayout,
+                                                              ADataType,
+                                                              BDataType,
+                                                              CDataType,
+                                                              AElementOp,
+                                                              BElementOp,
+                                                              CElementOp>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto c_element_op = CElementOp{};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ c_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+            std::cout << op_name << " supports this problem" << std::endl;
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ // std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+ c_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+ StrideC,
+ a_element_op,
+ b_element_op,
+ c_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+            hipMemcpy(in_data,
+                      c_device_buf.GetDeviceBuffer(),
+                      sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}),
+                      hipMemcpyDeviceToHost);
+            for(int i = 0; i < 100; i++)
+            {
+                std::cout << in_data[i] << " ";
+            }
+            std::cout << std::endl;
+        }
+
+        delete[] in_data;
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/3rdparty/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp b/3rdparty/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
new file mode 100644
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using D0DataType = F16;
+using D1DataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using ELayout = Row;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ // GEMM shape
+ ck::index_t M = 3840;
+ ck::index_t N = 4096;
+ ck::index_t K = 4096;
+
+ ck::index_t StrideA = 4096;
+ ck::index_t StrideB = 4096;
+ ck::index_t StrideD0 = 0;
+ ck::index_t StrideD1 = 4096;
+ ck::index_t StrideE = 4096;
+
+ if(argc == 1)
+ {
+ // use default case
+ }
+ else if(argc == 9)
+ {
+ M = std::stoi(argv[1]);
+ N = std::stoi(argv[2]);
+ K = std::stoi(argv[3]);
+
+ StrideA = std::stoi(argv[4]);
+ StrideB = std::stoi(argv[5]);
+ StrideD0 = std::stoi(argv[6]);
+ StrideD1 = std::stoi(argv[7]);
+ StrideE = std::stoi(argv[8]);
+ }
+ else
+ {
+ printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
+ exit(0);
+ }
+
+ auto f_matrix_space_size =
+ [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+ using Layout = decltype(layout);
+
+            if(std::is_same<Layout, Row>::value)
+ {
+ return (nRow - 1) * stride + nCol;
+ }
+ else
+ {
+ return (nCol - 1) * stride + nRow;
+ }
+ };
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+ SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+ SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+ f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+ SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
+ f_matrix_space_size(M, N, StrideD1, D1Layout{}));
+ SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+ using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+ ALayout,
+ BLayout,
+        ck::Tuple<D0Layout, D1Layout>,
+ ELayout,
+ ADataType,
+ BDataType,
+        ck::Tuple<D0DataType, D1DataType>,
+ EDataType,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::PassThrough,
+ ck::tensor_operation::element_wise::AddAddFastGelu>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ const auto a_element_op = AElementOp{};
+ const auto b_element_op = BElementOp{};
+ const auto cde_element_op = CDEElementOp{};
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(
+ a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+            std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+                                       d1_m_n_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+            std::array<ck::index_t, 2>{StrideD0, StrideD1},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = std::size_t(2) * M * N * K;
+
+ std::size_t num_btype =
+ sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(
+ a_device_buf.GetDeviceBuffer(),
+ b_device_buf.GetDeviceBuffer(),
+            std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+                                       d1_m_n_device_buf.GetDeviceBuffer()},
+ e_device_buf.GetDeviceBuffer(),
+ M,
+ N,
+ K,
+ StrideA,
+ StrideB,
+            std::array<ck::index_t, 2>{StrideD0, StrideD1},
+ StrideE,
+ a_element_op,
+ b_element_op,
+ cde_element_op);
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
diff --git a/3rdparty/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp b/3rdparty/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..512555f978efc6932e38ff31ec20ebf3aab4b063
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = AddFastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using D0DataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using D0Layout = Row;
+using ELayout = Row;
+
+// RAII wrapper for a raw HIP device allocation; memory is acquired in the
+// constructor and released by hipFree in the destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Allocate mem_size bytes of device memory. The hipMalloc status is
+    // deliberately discarded in this example; p_mem_ stays null on failure.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // Non-owning pointer to the allocation, for kernel arguments.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Profile every available GEMM + Add + FastGelu instance for one problem
+// size, print per-instance TFlops / bandwidth, then re-run the fastest
+// instance once without timing.
+//
+// Usage: exe [M N K StrideA StrideB StrideD0 StrideE]
+int main(int argc, char* argv[])
+{
+    // GEMM shape (defaults used when no arguments are given)
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideE = 4096;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 8)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+
+        StrideA = std::stoi(argv[4]);
+        StrideB = std::stoi(argv[5]);
+        StrideD0 = std::stoi(argv[6]);
+        // BUGFIX: this used to read argv[8]; with argc == 8 that is
+        // argv[argc], a guaranteed null pointer, so std::stoi crashed.
+        // StrideE is the 7th command-line argument.
+        StrideE = std::stoi(argv[7]);
+    }
+    else
+    {
+        printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
+        exit(0);
+    }
+
+    // Smallest element count a (nRow x nCol) matrix occupies given its
+    // leading-dimension stride and layout.
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if(std::is_same<Layout, Row>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+                                      f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<D0Layout>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<D0DataType>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::AddFastGelu>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+
+    std::string best_op_name;
+    bool found = false;
+    int best_op_id = -1;
+    float best_ave_time = 0;
+    float best_tflops = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            a_device_buf.GetDeviceBuffer(),
+            b_device_buf.GetDeviceBuffer(),
+            std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
+            e_device_buf.GetDeviceBuffer(),
+            M,
+            N,
+            K,
+            StrideA,
+            StrideB,
+            std::array<ck::index_t, 1>{StrideD0},
+            StrideE,
+            a_element_op,
+            b_element_op,
+            cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+
+            // NOTE(review): D0 traffic is not counted here, matching the
+            // bandwidth accounting of the sibling client examples.
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found = true;
+                best_op_id = i;
+                best_op_name = op_name;
+                best_tflops = tflops;
+                best_ave_time = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // BUGFIX: previously best_op_id stayed -1 when no instance supported the
+    // problem and op_ptrs[best_op_id] below was undefined behavior.
+    if(!found)
+    {
+        std::cout << "no instance supports this problem" << std::endl;
+        return -1;
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            a_device_buf.GetDeviceBuffer(),
+            b_device_buf.GetDeviceBuffer(),
+            std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
+            e_device_buf.GetDeviceBuffer(),
+            M,
+            N,
+            K,
+            StrideA,
+            StrideB,
+            std::array<ck::index_t, 1>{StrideD0},
+            StrideE,
+            a_element_op,
+            b_element_op,
+            cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/3rdparty/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp b/3rdparty/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72372310321e297b3a0d9101c808eda273675dde
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using FastGelu = ck::tensor_operation::element_wise::FastGelu;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = FastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;
+
+// RAII wrapper for a raw HIP device allocation; memory is acquired in the
+// constructor and released by hipFree in the destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Allocate mem_size bytes of device memory. The hipMalloc status is
+    // deliberately discarded in this example; p_mem_ stays null on failure.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // Non-owning pointer to the allocation, for kernel arguments.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Profile every available GEMM + FastGelu instance for one problem size,
+// print per-instance TFlops / bandwidth, then re-run the fastest instance
+// once without timing.
+//
+// Usage: exe [M N K StrideA StrideB StrideE]
+int main(int argc, char* argv[])
+{
+    // GEMM shape (defaults used when no arguments are given)
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideE = 4096;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 7)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+
+        StrideA = std::stoi(argv[4]);
+        StrideB = std::stoi(argv[5]);
+        // BUGFIX: this used to read argv[8]; with argc == 7 that is past
+        // the argv array (argv[7] is the null terminator), i.e. undefined
+        // behavior. StrideE is the 6th command-line argument.
+        StrideE = std::stoi(argv[6]);
+    }
+    else
+    {
+        printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideE\n");
+        exit(0);
+    }
+
+    // Smallest element count a (nRow x nCol) matrix occupies given its
+    // leading-dimension stride and layout.
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if(std::is_same<Layout, Row>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::FastGelu>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+
+    std::string best_op_name;
+    bool found = false;
+    int best_op_id = -1;
+    float best_ave_time = 0;
+    float best_tflops = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        {},
+                                                        e_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        {},
+                                                        StrideE,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found = true;
+                best_op_id = i;
+                best_op_name = op_name;
+                best_tflops = tflops;
+                best_ave_time = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // BUGFIX: previously best_op_id stayed -1 when no instance supported the
+    // problem and op_ptrs[best_op_id] below was undefined behavior.
+    if(!found)
+    {
+        std::cout << "no instance supports this problem" << std::endl;
+        return -1;
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best instance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        {},
+                                                        e_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        {},
+                                                        StrideE,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/3rdparty/composable_kernel/client_example/03_gemm_layernorm/CMakeLists.txt b/3rdparty/composable_kernel/client_example/03_gemm_layernorm/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3742e70844b96575e263b22a14b0bb8c4cde7a43
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/03_gemm_layernorm/CMakeLists.txt
@@ -0,0 +1,2 @@
+# Client example: GEMM + add + add with mean/mean-square reduction, followed
+# by a separate normalize (layernorm) kernel.
+add_executable(client_gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp)
+target_link_libraries(client_gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations)
diff --git a/3rdparty/composable_kernel/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp b/3rdparty/composable_kernel/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c259407d4608f90bd331ee9a9686b56ad62de90
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using ADataType = F16;
+using BDataType = F16;
+using BiasDataType = F32;
+using CDataType = F16;
+using D0DataType = F16;
+using ReduceDataType = F32;
+using GammaDataType = F16;
+using BetaDataType = F16;
+using LayerNormOutDataType = F16;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+// RAII wrapper for a raw HIP device allocation; memory is acquired in the
+// constructor and released by hipFree in the destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Allocate mem_size bytes of device memory. The hipMalloc status is
+    // deliberately discarded in this example; p_mem_ stays null on failure.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // Non-owning pointer to the allocation, for kernel arguments.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Launch one fused GEMM + bias + add + reduction instance:
+// c = a * b (+ bias, + d0), plus the row-wise mean and mean-of-squares of c
+// that the subsequent normalize step consumes.
+// Returns true if the instance supports the problem and was launched.
+template <typename gemm_reduce_op_ptr>
+bool RunDeviceGemmMeanSquareMean(gemm_reduce_op_ptr& p_op,
+                                 const void* p_a,
+                                 const void* p_b,
+                                 const void* p_bias,
+                                 const void* p_d0,
+                                 void* p_c,
+                                 void* p_mean,
+                                 void* p_square_mean,
+                                 int M,
+                                 int N,
+                                 int K,
+                                 int StrideA,
+                                 int StrideB,
+                                 int StrideC,
+                                 int StrideD0,
+                                 bool time_kernel)
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide;
+    using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
+
+    auto passOp = PassThrough{};
+    auto squareOp = UnarySquareElementOp{};
+    // Dividing the reduction sums by N turns them into (square) means.
+    auto divOp = UnaryDivElementOp{N};
+
+    auto argument_ptr =
+        p_op->MakeArgumentPointer(p_a,
+                                  p_b,
+                                  p_bias,
+                                  {p_d0},
+                                  p_c,
+                                  {p_mean, p_square_mean},
+                                  M,
+                                  N,
+                                  K,
+                                  StrideA,
+                                  StrideB,
+                                  StrideC,
+                                  {StrideD0},
+                                  {&passOp, &passOp, &passOp}, // functor for a, b, c
+                                  {&passOp},                   // functor for d0
+                                  {&passOp, &squareOp},        // functor for inputs of reduction
+                                  {&divOp, &divOp});           // functor for outputs of reduction
+
+    if(p_op->IsSupportedArgument(argument_ptr.get()))
+    {
+        auto invoker_ptr = p_op->MakeInvokerPointer();
+
+        // NOTE: when timing is enabled the reduction outputs may be wrong.
+        // The reduction tensors must be re-initialized before every launch,
+        // but with time_kernel = true the kernel is run many times without
+        // re-initializing them.
+        float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+        if(time_kernel)
+            std::cout << "Gemm + reduce Perf: " << std::setw(10) << ave_time << " ms" << std::endl;
+
+        return true;
+    }
+
+    return false;
+}
+
+// Launch one 2D normalize instance: y = normalize(x, mean, square_mean)
+// scaled by gamma and shifted by beta, over an M x N tensor.
+// mean/square_mean are per-row (stride {1, 0} broadcasts them along N);
+// gamma/beta are per-column (stride {0, 1} broadcasts them along M).
+// Returns true if the instance supports the problem and was launched.
+template <typename normalize_op_ptr>
+bool RunDeviceNormalize2D(normalize_op_ptr& p_op,
+                          const void* p_x,
+                          const void* p_mean,
+                          const void* p_square_mean,
+                          const void* p_gamma,
+                          const void* p_beta,
+                          void* p_y,
+                          int M,
+                          int N,
+                          int StrideX,
+                          bool time_kernel)
+{
+    std::array<const void*, 5> input = {p_x, p_mean, p_square_mean, p_gamma, p_beta};
+    std::array<void*, 1> output = {p_y};
+    auto normalize_functor = ck::tensor_operation::element_wise::Normalize{};
+
+    std::array<ck::index_t, 2> xyLengths = {M, N};
+    std::array<ck::index_t, 2> xyStrides = {StrideX, 1};
+
+    auto argument_ptr = p_op->MakeArgumentPointer(xyLengths,
+                                                  {xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}},
+                                                  {xyStrides},
+                                                  input,
+                                                  output,
+                                                  ck::tensor_operation::element_wise::Normalize{});
+
+    if(p_op->IsSupportedArgument(argument_ptr.get()))
+    {
+        auto invoker_ptr = p_op->MakeInvokerPointer();
+        float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+        if(time_kernel)
+            std::cout << "Normalize Perf: " << std::setw(10) << ave_time << " ms" << std::endl;
+
+        return true;
+    }
+
+    return false;
+}
+
+// Compose layernorm from two device kernels:
+// (1) c = gemm(a, b) + bias + d0, with fused row-wise mean / mean-of-squares
+// (2) normalize(c, mean, square_mean, gamma, beta)
+// Runs the first instance of each stage that supports the problem.
+int main()
+{
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+
+    ck::index_t StrideA = 1024;
+    ck::index_t StrideB = 1024;
+    ck::index_t StrideC = 1024;
+    ck::index_t StrideD0 = 1024;
+
+    const auto gemm_reduce_ptrs =
+        ck::tensor_operation::device::instance::get_device_gemm_add_add_mean_squaremean_instances<
+            ADataType,
+            BDataType,
+            CDataType,
+            ALayout,
+            BLayout,
+            CLayout>();
+
+    const auto normalize_ptrs =
+        ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances<
+            CDataType,
+            ReduceDataType,
+            ReduceDataType,
+            GammaDataType,
+            BetaDataType,
+            LayerNormOutDataType>();
+
+    std::cout << "found " << gemm_reduce_ptrs.size()
+              << " gemm_reduceMean_reduceSquareMean instances" << std::endl;
+
+    std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;
+
+    // Smallest element count a (nRow x nCol) matrix occupies given its
+    // leading-dimension stride and layout.
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem bias_device_buf(sizeof(BiasDataType) * N);
+    SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
+    SimpleDeviceMem d0_device_buf(sizeof(D0DataType) *
+                                  f_matrix_space_size(M, N, StrideD0, CLayout{}));
+    SimpleDeviceMem reduceMean_device_buf(sizeof(ReduceDataType) * M);
+    SimpleDeviceMem reduceMeanSquare_device_buf(sizeof(ReduceDataType) * M);
+    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
+    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
+    SimpleDeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) * M * N);
+
+    bool b_time_kernel = true;
+    bool b_only_run_first_kernel = true;
+
+    // layernorm => (1) + (2)
+    // (1). c = gemm(a, b), reduce_mean(c), reduce_square_mean(c)
+    // (2). normalize(c, mean, square_mean, gamma, beta)
+    for(auto& gemm_reduce_ptr : gemm_reduce_ptrs)
+    {
+        // run first available kernel
+        if(RunDeviceGemmMeanSquareMean(gemm_reduce_ptr,
+                                       a_device_buf.GetDeviceBuffer(),
+                                       b_device_buf.GetDeviceBuffer(),
+                                       bias_device_buf.GetDeviceBuffer(),
+                                       d0_device_buf.GetDeviceBuffer(),
+                                       c_device_buf.GetDeviceBuffer(),
+                                       reduceMean_device_buf.GetDeviceBuffer(),
+                                       reduceMeanSquare_device_buf.GetDeviceBuffer(),
+                                       M,
+                                       N,
+                                       K,
+                                       StrideA,
+                                       StrideB,
+                                       StrideC,
+                                       StrideD0,
+                                       b_time_kernel))
+        {
+            if(b_only_run_first_kernel)
+                break;
+        }
+        else
+        {
+            std::cout << gemm_reduce_ptr->GetTypeString() << " does not support this problem"
+                      << std::endl;
+        }
+    }
+
+    for(auto& normalize_ptr : normalize_ptrs)
+    {
+        if(RunDeviceNormalize2D(normalize_ptr,
+                                c_device_buf.GetDeviceBuffer(),
+                                reduceMean_device_buf.GetDeviceBuffer(),
+                                reduceMeanSquare_device_buf.GetDeviceBuffer(),
+                                gamma_device_buf.GetDeviceBuffer(),
+                                beta_device_buf.GetDeviceBuffer(),
+                                layerNorm_device_buf.GetDeviceBuffer(),
+                                M,
+                                N,
+                                StrideC,
+                                b_time_kernel))
+        {
+            if(b_only_run_first_kernel)
+                break;
+        }
+        else
+        {
+            std::cout << normalize_ptr->GetTypeString() << " does not support this problem"
+                      << std::endl;
+        }
+    }
+}
diff --git a/3rdparty/composable_kernel/client_example/04_contraction/CMakeLists.txt b/3rdparty/composable_kernel/client_example/04_contraction/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4bc6780f96d2fe4a4912e3c188b4b5155cc162dd
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/04_contraction/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Client example: tensor contraction with scale epilogue.
+add_executable(client_contraction_scale contraction_scale.cpp)
+target_link_libraries(client_contraction_scale PRIVATE composable_kernel::device_operations)
+
+# Client example: tensor contraction with bilinear (alpha*AB + beta*D) epilogue.
+add_executable(client_contraction_bilinear contraction_bilinear.cpp)
+target_link_libraries(client_contraction_bilinear PRIVATE composable_kernel::device_operations)
+
diff --git a/3rdparty/composable_kernel/client_example/04_contraction/contraction_bilinear.cpp b/3rdparty/composable_kernel/client_example/04_contraction/contraction_bilinear.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..91dead41a4cac19db857b99a233839e9e6647c57
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/04_contraction/contraction_bilinear.cpp
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = Bilinear;
+
+using ADataType = F32;
+using BDataType = F32;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using DDataType = F32;
+using DsDataType = ck::Tuple;
+using EDataType = F32;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+// RAII wrapper for a raw HIP device allocation; memory is acquired in the
+// constructor and released by hipFree in the destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Allocate mem_size bytes of device memory. The hipMalloc status is
+    // deliberately discarded in this example; p_mem_ stays null on failure.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // Non-owning pointer to the allocation, for kernel arguments.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Profile all bilinear contraction instances:
+// E[m0,m1,n0,n1] = alpha * A[m0,m1,k0,k1] * B[n0,n1,k0,k1] + beta * D[m0,m1,n0,n1]
+// and report the best-performing one.
+int main(int argc, char* argv[])
+{
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // D[M0, M1, N0, N1]
+    std::vector<ck::index_t> d_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> d_ms_ns_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+
+    float alpha = 1.f;
+    float beta = 1.f;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 25)
+    {
+        const ck::index_t M0 = std::stoi(argv[1]);
+        const ck::index_t M1 = std::stoi(argv[2]);
+
+        const ck::index_t N0 = std::stoi(argv[3]);
+        const ck::index_t N1 = std::stoi(argv[4]);
+
+        const ck::index_t K0 = std::stoi(argv[5]);
+        const ck::index_t K1 = std::stoi(argv[6]);
+
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
+
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
+
+        d_ms_ns_lengths = {M0, M1, N0, N1};
+        d_ms_ns_strides = {
+            std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
+
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])};
+
+        alpha = std::stof(argv[23]);
+        beta = std::stof(argv[24]);
+    }
+    else
+    {
+        printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
+        printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n");
+        printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg23 to 24: alpha, beta\n");
+        exit(0);
+    }
+
+    // 1 + sum((len - 1) * stride): element count spanned by a strided tensor.
+    auto f_tensor_space_size = [](auto lengths, auto strides) {
+        std::size_t space_size = 1;
+        for(std::size_t i = 0; i < lengths.size(); ++i)
+        {
+            space_size += (lengths[i] - 1) * strides[i];
+        }
+        return space_size;
+    };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+                                 f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+                                 f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
+    SimpleDeviceMem d_device_buf(sizeof(DDataType) *
+                                 f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+                                 f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        ADataType,
+        BDataType,
+        ck::Tuple<DDataType>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::Bilinear>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto cde_element_op = CDEElementOp{alpha, beta};
+
+    std::string best_op_name;
+    bool found = false;
+    int best_op_id = -1;
+    float best_ave_time = 0;
+    float best_tflops = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                        b_device_buf.GetDeviceBuffer(),
+                                        std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                                        e_device_buf.GetDeviceBuffer(),
+                                        a_ms_ks_lengths,
+                                        a_ms_ks_strides,
+                                        b_ns_ks_lengths,
+                                        b_ns_ks_strides,
+                                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_lengths},
+                                        std::array<std::vector<ck::index_t>, 1>{d_ms_ns_strides},
+                                        e_ms_ns_lengths,
+                                        e_ms_ns_strides,
+                                        a_element_op,
+                                        b_element_op,
+                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            // Flatten the multi-index dimensions into GEMM-like M, N, K.
+            ck::index_t M = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+            ck::index_t N = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+            ck::index_t K = ck::accumulate_n<ck::index_t>(
+                a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                    sizeof(DDataType) * M * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found = true;
+                best_op_id = i;
+                best_op_name = op_name;
+                best_tflops = tflops;
+                best_ave_time = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // Avoid reporting a bogus "best" when no instance supported the problem.
+    if(!found)
+    {
+        std::cout << "no instance supports this problem" << std::endl;
+        return -1;
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    return 0;
+}
diff --git a/3rdparty/composable_kernel/client_example/04_contraction/contraction_scale.cpp b/3rdparty/composable_kernel/client_example/04_contraction/contraction_scale.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e08ee19cdb098b2dfb70a662d59c87008400123
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/04_contraction/contraction_scale.cpp
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp"
+#include "ck/library/utility/numeric.hpp"
+
+using F32 = float;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Scale = ck::tensor_operation::element_wise::Scale;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CDEElementOp = Scale;
+
+using ADataType = F32;
+using BDataType = F32;
+using AccDataType = F32;
+using CShuffleDataType = F32;
+using DsDataType = ck::Tuple<>;
+using EDataType = F32;
+
+static constexpr ck::index_t NumDimM = 2;
+static constexpr ck::index_t NumDimN = 2;
+static constexpr ck::index_t NumDimK = 2;
+
+// RAII wrapper for a raw HIP device allocation; memory is acquired in the
+// constructor and released by hipFree in the destructor.
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    // Allocate mem_size bytes of device memory. The hipMalloc status is
+    // deliberately discarded in this example; p_mem_ stays null on failure.
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    // Non-owning pointer to the allocation, for kernel arguments.
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+// Profile all scale contraction instances:
+// E[m0,m1,n0,n1] = scale * A[m0,m1,k0,k1] * B[n0,n1,k0,k1]
+// and report the best-performing one.
+int main(int argc, char* argv[])
+{
+    // A[M0, M1, K0, K1]
+    std::vector<ck::index_t> a_ms_ks_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> a_ms_ks_strides{524288, 4096, 128, 1};
+    // B[N0, N1, K0, K1]
+    std::vector<ck::index_t> b_ns_ks_lengths{32, 64, 32, 64};
+    std::vector<ck::index_t> b_ns_ks_strides{524288, 4096, 128, 1};
+    // E[M0, M1, N0, N1]
+    std::vector<ck::index_t> e_ms_ns_lengths{30, 128, 32, 64};
+    std::vector<ck::index_t> e_ms_ns_strides{524288, 4096, 128, 1};
+
+    float scale = 1.f;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 20)
+    {
+        const ck::index_t M0 = std::stoi(argv[1]);
+        const ck::index_t M1 = std::stoi(argv[2]);
+
+        const ck::index_t N0 = std::stoi(argv[3]);
+        const ck::index_t N1 = std::stoi(argv[4]);
+
+        const ck::index_t K0 = std::stoi(argv[5]);
+        const ck::index_t K1 = std::stoi(argv[6]);
+
+        a_ms_ks_lengths = {M0, M1, K0, K1};
+        a_ms_ks_strides = {
+            std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])};
+
+        b_ns_ks_lengths = {N0, N1, K0, K1};
+        b_ns_ks_strides = {
+            std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])};
+
+        e_ms_ns_lengths = {M0, M1, N0, N1};
+        e_ms_ns_strides = {
+            std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])};
+
+        scale = std::stof(argv[19]);
+    }
+    else
+    {
+        printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n");
+        printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
+        printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
+        printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
+        printf("arg19: scale\n");
+        exit(0);
+    }
+
+    // 1 + sum((len - 1) * stride): element count spanned by a strided tensor.
+    auto f_tensor_space_size = [](auto lengths, auto strides) {
+        std::size_t space_size = 1;
+        for(std::size_t i = 0; i < lengths.size(); ++i)
+        {
+            space_size += (lengths[i] - 1) * strides[i];
+        }
+        return space_size;
+    };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) *
+                                 f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) *
+                                 f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
+                                 f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        ADataType,
+        BDataType,
+        ck::Tuple<>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::Scale>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto cde_element_op = CDEElementOp{scale};
+
+    std::string best_op_name;
+    bool found = false;
+    int best_op_id = -1;
+    float best_ave_time = 0;
+    float best_tflops = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < static_cast<int>(op_ptrs.size()); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                        b_device_buf.GetDeviceBuffer(),
+                                        std::array<const void*, 0>{},
+                                        e_device_buf.GetDeviceBuffer(),
+                                        a_ms_ks_lengths,
+                                        a_ms_ks_strides,
+                                        b_ns_ks_lengths,
+                                        b_ns_ks_strides,
+                                        std::array<std::vector<ck::index_t>, 0>{},
+                                        std::array<std::vector<ck::index_t>, 0>{},
+                                        e_ms_ns_lengths,
+                                        e_ms_ns_strides,
+                                        a_element_op,
+                                        b_element_op,
+                                        cde_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            // Flatten the multi-index dimensions into GEMM-like M, N, K.
+            ck::index_t M = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{});
+
+            ck::index_t N = ck::accumulate_n<ck::index_t>(
+                e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{});
+
+            ck::index_t K = ck::accumulate_n<ck::index_t>(
+                a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found = true;
+                best_op_id = i;
+                best_op_name = op_name;
+                best_tflops = tflops;
+                best_ave_time = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    // Avoid reporting a bogus "best" when no instance supported the problem.
+    if(!found)
+    {
+        std::cout << "no instance supports this problem" << std::endl;
+        return -1;
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    return 0;
+}
diff --git a/3rdparty/composable_kernel/client_example/05_layernorm/CMakeLists.txt b/3rdparty/composable_kernel/client_example/05_layernorm/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b582b485d4ce46951aaed98c6256e1c997388bd3
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/05_layernorm/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_layernorm2d layernorm2d.cpp)
+target_link_libraries(client_layernorm2d PRIVATE composable_kernel::device_operations)
diff --git a/3rdparty/composable_kernel/client_example/05_layernorm/layernorm2d.cpp b/3rdparty/composable_kernel/client_example/05_layernorm/layernorm2d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..adb41171e12a87ffafe42e4f112a3e89cfc7296e
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/05_layernorm/layernorm2d.cpp
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
+
+using XDataType = ck::half_t;
+using GammaDataType = ck::half_t;
+using BetaDataType = ck::half_t;
+using YDataType = ck::half_t;
+using AccDataType = float;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+constexpr int Rank = 2;
+constexpr int NumReduceDim = 1;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ ck::index_t M = 1024;
+ ck::index_t N = 1024;
+ ck::index_t Stride = 1024;
+
+ auto xy_size = (M - 1) * Stride + N;
+
+ SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
+ SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
+ SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
+ SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
+
+ using DeviceOp = ck::tensor_operation::device::DeviceNormalization;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = std::numeric_limits::max();
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
+ {Stride, 1}, // xStrides
+ {0, 1}, // gammaStrides
+ {0, 1}, // betaStrides
+ {Stride, 1}, // yStrides
+ {1}, // reduceDims
+ 1e-4,
+ x_device_buf.GetDeviceBuffer(),
+ gamma_device_buf.GetDeviceBuffer(),
+ beta_device_buf.GetDeviceBuffer(),
+ y_device_buf.GetDeviceBuffer(),
+ nullptr,
+ nullptr,
+ PassThrough{});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
+ sizeof(BetaDataType) * N + sizeof(YDataType) * M * N;
+
+ float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+ << op_name << std::endl;
+
+ if(ave_time < best_ave_time)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+ << best_op_name << std::endl;
+
+ // run the best intance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths
+ {Stride, 1}, // xStrides
+ {1}, // gammaStrides
+ {1}, // betaStrides
+ {Stride, 1}, // yStrides
+ {1}, // reduceDims
+ 1e-4,
+ x_device_buf.GetDeviceBuffer(),
+ gamma_device_buf.GetDeviceBuffer(),
+ beta_device_buf.GetDeviceBuffer(),
+ y_device_buf.GetDeviceBuffer(),
+ nullptr,
+ nullptr,
+ PassThrough{});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
diff --git a/3rdparty/composable_kernel/client_example/06_softmax/CMakeLists.txt b/3rdparty/composable_kernel/client_example/06_softmax/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b38a0fd9e27570e62df7f701572a24fb4ee842f9
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/06_softmax/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_softmax4d softmax4d.cpp)
+target_link_libraries(client_softmax4d PRIVATE composable_kernel::device_operations)
diff --git a/3rdparty/composable_kernel/client_example/06_softmax/softmax4d.cpp b/3rdparty/composable_kernel/client_example/06_softmax/softmax4d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7745ddf34cf7abb65265ada74f0896dedcbbf655
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/06_softmax/softmax4d.cpp
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"
+
+using InDataType = ck::half_t;
+using OutDataType = ck::half_t;
+using AccDataType = float;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+constexpr int Rank = 4;
+constexpr int NumReduceDim = 2;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ std::vector in_lengths{2, 8, 128, 1024};
+ std::vector in_strides{8 * 128 * 1024, 128 * 1024, 1024, 1};
+ std::vector reduce_dims{2, 3};
+
+ ck::index_t num_elements =
+ std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies());
+
+ AccDataType alpha{2.0f};
+ AccDataType beta{2.0f};
+
+ SimpleDeviceMem in(sizeof(InDataType) * num_elements);
+ SimpleDeviceMem out(sizeof(OutDataType) * num_elements);
+
+ using DeviceOp = ck::tensor_operation::device::
+ DeviceSoftmax;
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ std::string best_op_name;
+ bool found = false;
+ int best_op_id = -1;
+ float best_ave_time = std::numeric_limits::max();
+ float best_gb_per_sec = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+
+ if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim)
+ {
+ continue;
+ }
+
+ auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
+ in_strides,
+ reduce_dims,
+ &alpha,
+ &beta,
+ in.GetDeviceBuffer(),
+ out.GetDeviceBuffer(),
+ PassThrough{},
+ PassThrough{});
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t num_bytes = num_elements * sizeof(InDataType) +
+ (beta == 0.0f ? 1 : 2) * num_elements * sizeof(OutDataType);
+
+ float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+ << op_name << std::endl;
+
+ if(ave_time < best_ave_time)
+ {
+ found = true;
+ best_op_id = i;
+ best_op_name = op_name;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+ << best_op_name << std::endl;
+
+ // run the best intance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+ auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
+ in_strides,
+ reduce_dims,
+ &alpha,
+ &beta,
+ in.GetDeviceBuffer(),
+ out.GetDeviceBuffer(),
+ PassThrough{},
+ PassThrough{});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/3rdparty/composable_kernel/client_example/07_grouped_conv2d_fwd/CMakeLists.txt b/3rdparty/composable_kernel/client_example/07_grouped_conv2d_fwd/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ddc83168acfbbdf3d58b0909b761947c792a3c06
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/07_grouped_conv2d_fwd/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp)
+target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations)
diff --git a/3rdparty/composable_kernel/client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp b/3rdparty/composable_kernel/client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c119c78ab6afc09e0c30f6ade6a52ae9ecb139ea
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+// using InDataType = ck::half_t;
+// using WeiDataType = ck::half_t;
+// using OutDataType = ck::half_t;
+using InDataType = float;
+using WeiDataType = float;
+using OutDataType = float;
+
+using InLayout = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G = 1;
+static constexpr ck::index_t N = 256;
+static constexpr ck::index_t K = 192;
+static constexpr ck::index_t C = 192;
+static constexpr ck::index_t Y = 3;
+static constexpr ck::index_t X = 3;
+static constexpr ck::index_t Hi = 28;
+static constexpr ck::index_t Wi = 28;
+static constexpr ck::index_t Ho = 26;
+static constexpr ck::index_t Wo = 26;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+void printArray(std::array array)
+{
+ for(int i=0;i in_lengths{G, N, Hi, Wi, C};
+ std::array in_strides{0, 0, 0, 0, 1};
+
+ std::array wei_lengths{G, K, Y, X, C};
+ std::array wei_strides{0, 0, 0, 0, 1};
+
+ std::array out_lengths{G, N, Ho, Wo, K};
+ std::array out_strides{0, 0, 0, 0, 1};
+
+ std::partial_sum(rbegin(in_lengths),
+ std::prev(rend(in_lengths)),
+ std::next(rbegin(in_strides)),
+ std::multiplies<>{});
+ std::partial_sum(rbegin(wei_lengths),
+ std::prev(rend(wei_lengths)),
+ std::next(rbegin(wei_strides)),
+ std::multiplies<>{});
+ std::partial_sum(rbegin(out_lengths),
+ std::prev(rend(out_lengths)),
+ std::next(rbegin(out_strides)),
+ std::multiplies<>{});
+
+ printArray(in_lengths);
+ printArray(in_strides);
+
+ // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
+ std::rotate(
+ rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
+ std::rotate(
+ rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
+ std::rotate(
+ rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
+ std::rotate(
+ rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
+ std::rotate(
+ rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
+ std::rotate(
+ rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
+
+ printArray(in_lengths);
+ printArray(in_strides);
+
+ std::array filter_strides{1, 1};
+ std::array filter_dilations{1, 1};
+ std::array input_left_pads{0, 0};
+ std::array input_right_pads{0, 0};
+
+ SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
+ SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+ SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+
+ InDataType *in_data=new InDataType[G * N * Hi * Wi * C];
+ for(int i=0;i,
+ OutLayout,
+ InDataType,
+ WeiDataType,
+ ck::Tuple<>,
+ OutDataType,
+ PassThrough,
+ PassThrough,
+ PassThrough>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ std::string best_op_name;
+ int best_op_id = -1;
+ float best_avg_time = std::numeric_limits::max();
+ float best_gb_per_sec = 0;
+ float best_tflops = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+ auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {},
+ out.GetDeviceBuffer(),
+ in_lengths,
+ in_strides,
+ wei_lengths,
+ wei_strides,
+ {},
+ {},
+ out_lengths,
+ out_strides,
+ filter_strides,
+ filter_dilations,
+ input_left_pads,
+ input_right_pads,
+ PassThrough{},
+ PassThrough{},
+ PassThrough{});
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ std::cout << op_name << std::endl;
+ float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
+ std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
+ sizeof(WeiDataType) * G * K * Y * X * C +
+ sizeof(OutDataType) * G * N * Ho * Wo * K;
+
+ float tflops = static_cast(flop) / 1.E9 / avg_time;
+ float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+ //std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+ //<< gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ best_op_id = i;
+ best_op_name = op_name;
+ best_avg_time = avg_time;
+ best_gb_per_sec = gb_per_sec;
+ best_tflops = tflops;
+ }
+ }
+ else
+ {
+ //std::cerr << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ if(best_op_id < 0)
+ {
+ std::cerr << "no suitable instance" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+ << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ // run the best intance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+ auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {},
+ out.GetDeviceBuffer(),
+ in_lengths,
+ in_strides,
+ wei_lengths,
+ wei_strides,
+ {},
+ {},
+ out_lengths,
+ out_strides,
+ filter_strides,
+ filter_dilations,
+ input_left_pads,
+ input_right_pads,
+ PassThrough{},
+ PassThrough{},
+ PassThrough{});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ hipMemcpy(in_data,out.GetDeviceBuffer(),sizeof(OutDataType) * G * N * Ho * Wo * K,hipMemcpyDeviceToHost);
+ for(int i=0;i<10;i++){
+ std::cout<
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using AElementOp = ck::tensor_operation::element_wise::PassThrough;
+using B0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
+using B1ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+constexpr static auto MaskingSpec =
+ ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
+
+using ADataType = ck::half_t;
+using B0DataType = ck::half_t;
+using B1DataType = ck::half_t;
+using CDataType = ck::half_t;
+using AccDataType = float;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ int G0 = 48;
+ int G1 = 16;
+ int M = 1024;
+ int N = 1024;
+ int K = 64;
+ int O = 64;
+
+ // A layout [G0, M, G1, K]
+ std::vector a_gs_ms_ks_lengths{G0, G1, M, K};
+ std::vector a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
+
+ // B0 layout [G0, N, G1, K]
+ std::vector b0_gs_ns_ks_lengths{G0, G1, N, K};
+ std::vector b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1};
+
+ // B1 layout [G0, N, G1, O]
+ std::vector b1_gs_os_ns_lengths{G0, G1, O, N};
+ std::vector b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O};
+
+ // C layout [G0, M, G1, O]
+ std::vector c_gs_ms_os_lengths{G0, G1, M, O};
+ std::vector c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
+
+ SimpleDeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K);
+ SimpleDeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K);
+ SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N);
+ SimpleDeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O);
+
+ using DeviceOp =
+ ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2,
+ 1,
+ 1,
+ 1,
+ 1,
+ ADataType,
+ B0DataType,
+ B1DataType,
+ CDataType,
+ ck::Tuple<>,
+ ck::Tuple<>,
+ AElementOp,
+ B0ElementOp,
+ Acc0ElementOp,
+ B1ElementOp,
+ CElementOp,
+ MaskingSpec>;
+
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ std::string best_op_name;
+ int best_op_id = -1;
+ float best_ave_time = 0;
+ float best_tflops = 0;
+ float best_gb_per_sec = 0;
+
+ // profile device op instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b0_device_buf.GetDeviceBuffer(),
+ b1_device_buf.GetDeviceBuffer(),
+ c_device_buf.GetDeviceBuffer(),
+ {}, // p_acc0_biases
+ {}, // p_acc1_biases
+ a_gs_ms_ks_lengths,
+ a_gs_ms_ks_strides,
+ b0_gs_ns_ks_lengths,
+ b0_gs_ns_ks_strides,
+ b1_gs_os_ns_lengths,
+ b1_gs_os_ns_strides,
+ c_gs_ms_os_lengths,
+ c_gs_ms_os_strides,
+ {}, // acc0_biases_gs_ms_ns_lengths
+ {}, // acc0_biases_gs_ms_ns_strides
+ {}, // acc1_biases_gs_ms_os_lengths
+ {}, // acc1_biases_gs_ms_os_strides
+ AElementOp{},
+ B0ElementOp{},
+ Acc0ElementOp{1 / sqrtf(K)},
+ B1ElementOp{},
+ CElementOp{});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1;
+ std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
+ sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
+ G0 * G1;
+
+ float tflops = static_cast(flop) / 1.E9 / ave_time;
+
+ float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+ std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+ << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ best_op_id = i;
+ best_op_name = op_name;
+ best_tflops = tflops;
+ best_ave_time = ave_time;
+ best_gb_per_sec = gb_per_sec;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+ << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ // run the best instance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+ auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+ b0_device_buf.GetDeviceBuffer(),
+ b1_device_buf.GetDeviceBuffer(),
+ c_device_buf.GetDeviceBuffer(),
+ {}, // p_acc0_biases
+ {}, // p_acc1_biases
+ a_gs_ms_ks_lengths,
+ a_gs_ms_ks_strides,
+ b0_gs_ns_ks_lengths,
+ b0_gs_ns_ks_strides,
+ b1_gs_os_ns_lengths,
+ b1_gs_os_ns_strides,
+ c_gs_ms_os_lengths,
+ c_gs_ms_os_strides,
+ {}, // acc0_biases_gs_ms_ns_lengths
+ {}, // acc0_biases_gs_ms_ns_strides
+ {}, // acc1_biases_gs_ms_os_lengths
+ {}, // acc1_biases_gs_ms_os_strides
+ AElementOp{},
+ B0ElementOp{},
+ Acc0ElementOp{1 / sqrtf(K)},
+ B1ElementOp{},
+ CElementOp{});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
diff --git a/3rdparty/composable_kernel/client_example/09_quantization/CMakeLists.txt b/3rdparty/composable_kernel/client_example/09_quantization/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7dc9b860c0c427a21e6127fcce5556dbb06089e9
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/09_quantization/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_executable(client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp)
+target_link_libraries(client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations)
+
+add_executable(client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp)
+target_link_libraries(client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations)
+
+add_executable(client_conv2d_fwd_perchannel_quantization conv2d_fwd_perchannel_quantization.cpp)
+target_link_libraries(client_conv2d_fwd_perchannel_quantization PRIVATE composable_kernel::device_operations)
+
+add_executable(client_conv2d_fwd_perlayer_quantization conv2d_fwd_perlayer_quantization.cpp)
+target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable_kernel::device_operations)
diff --git a/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp b/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bcb0cefa712cf144edf1916adf0c0f97515d56f0
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InDataType = int8_t;
+using WeiDataType = int8_t;
+using BiasDataType = int32_t;
+using RequantScaleDataType = float;
+using OutDataType = int8_t;
+
+using InLayout = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+using BiasLayout = ck::tensor_layout::convolution::G_K;
+using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
+using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::Relu;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G = 1;
+static constexpr ck::index_t N = 4;
+static constexpr ck::index_t K = 64;
+static constexpr ck::index_t C = 32;
+static constexpr ck::index_t Y = 3;
+static constexpr ck::index_t X = 3;
+static constexpr ck::index_t Hi = 71;
+static constexpr ck::index_t Wi = 71;
+static constexpr ck::index_t Ho = 36;
+static constexpr ck::index_t Wo = 36;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ std::array in_lengths{G, N, C, Hi, Wi};
+ std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+ std::array weight_lengths{G, K, C, Y, X};
+ std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+ std::array bias_lengths{G, N, K, Ho, Wo};
+ std::array bias_strides{K, 0, 1, 0, 0};
+ std::array requant_scale_lengths{G, N, K, Ho, Wo};
+ std::array requant_scale_strides{K, 0, 1, 0, 0};
+ std::array out_lengths{G, N, C, Ho, Wo};
+ std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
+ std::array in_left_pad{1, 1};
+ std::array in_right_pad{1, 1};
+ std::array conv_strides{2, 2};
+ std::array conv_dilations{1, 1};
+
+ SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
+ SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
+ SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
+ SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
+ SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+
+ using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+ NumDimSpatial,
+ InLayout,
+ WeiLayout,
+ ck::Tuple,
+ OutLayout,
+ InDataType,
+ WeiDataType,
+ ck::Tuple,
+ OutDataType,
+ PassThrough,
+ PassThrough,
+ OutElementOp>;
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ std::string best_op_name;
+ int best_op_id = -1;
+ float best_avg_time = std::numeric_limits::max();
+ float best_gb_per_sec = 0;
+ float best_tflops = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+ auto argument_ptr =
+ op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
+ out.GetDeviceBuffer(),
+ in_lengths,
+ in_strides,
+ weight_lengths,
+ weight_strides,
+ {bias_lengths, requant_scale_lengths},
+ {bias_strides, requant_scale_strides},
+ out_lengths,
+ out_strides,
+ conv_strides,
+ conv_dilations,
+ in_left_pad,
+ in_right_pad,
+ PassThrough{},
+ PassThrough{},
+ OutElementOp{ActivationOp{}});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
+ std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
+ G * sizeof(WeiDataType) * K * Y * X * C +
+ G * sizeof(OutDataType) * N * Ho * Wo * K;
+
+ float tflops = static_cast(flop) / 1.E9 / avg_time;
+ float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+ std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ best_op_id = i;
+ best_op_name = op_name;
+ best_avg_time = avg_time;
+ best_gb_per_sec = gb_per_sec;
+ best_tflops = tflops;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+ << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ // run the best intance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+ auto argument_ptr =
+ op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
+ out.GetDeviceBuffer(),
+ in_lengths,
+ in_strides,
+ weight_lengths,
+ weight_strides,
+ {bias_lengths, requant_scale_lengths},
+ {bias_strides, requant_scale_strides},
+ out_lengths,
+ out_strides,
+ conv_strides,
+ conv_dilations,
+ in_left_pad,
+ in_right_pad,
+ PassThrough{},
+ PassThrough{},
+ OutElementOp{ActivationOp{}});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp b/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..26c7aa15e2be0814b20b17f9c2c91fe21e70d961
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <limits>
#include <string>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InDataType = int8_t;
+using WeiDataType = int8_t;
+using BiasDataType = int32_t;
+using OutDataType = int8_t;
+
+using InLayout = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+using BiasLayout = ck::tensor_layout::convolution::G_K;
+using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::Relu;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G = 1;
+static constexpr ck::index_t N = 4;
+static constexpr ck::index_t K = 64;
+static constexpr ck::index_t C = 32;
+static constexpr ck::index_t Y = 3;
+static constexpr ck::index_t X = 3;
+static constexpr ck::index_t Hi = 71;
+static constexpr ck::index_t Wi = 71;
+static constexpr ck::index_t Ho = 36;
+static constexpr ck::index_t Wo = 36;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ std::array in_lengths{G, N, C, Hi, Wi};
+ std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+ std::array weight_lengths{G, K, C, Y, X};
+ std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+ std::array bias_lengths{G, N, K, Ho, Wo};
+ std::array bias_strides{K, 0, 1, 0, 0};
+ std::array out_lengths{G, N, C, Ho, Wo};
+ std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
+ std::array in_left_pad{1, 1};
+ std::array in_right_pad{1, 1};
+ std::array conv_strides{2, 2};
+ std::array conv_dilations{1, 1};
+
+ SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
+ SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
+ SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
+ SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+
+ using DeviceOp =
+ ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD,
+ OutLayout,
+ InDataType,
+ WeiDataType,
+ ck::Tuple,
+ OutDataType,
+ PassThrough,
+ PassThrough,
+ OutElementOp>;
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ std::string best_op_name;
+ int best_op_id = -1;
+ float best_avg_time = std::numeric_limits::max();
+ float best_gb_per_sec = 0;
+ float best_tflops = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+ auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {bias.GetDeviceBuffer()},
+ out.GetDeviceBuffer(),
+ in_lengths,
+ in_strides,
+ weight_lengths,
+ weight_strides,
+ {bias_lengths},
+ {bias_strides},
+ out_lengths,
+ out_strides,
+ conv_strides,
+ conv_dilations,
+ in_left_pad,
+ in_right_pad,
+ PassThrough{},
+ PassThrough{},
+ OutElementOp{0.5f, ActivationOp{}});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
+ std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
+ G * sizeof(WeiDataType) * K * Y * X * C +
+ G * sizeof(OutDataType) * N * Ho * Wo * K;
+
+ float tflops = static_cast(flop) / 1.E9 / avg_time;
+ float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+ std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ best_op_id = i;
+ best_op_name = op_name;
+ best_avg_time = avg_time;
+ best_gb_per_sec = gb_per_sec;
+ best_tflops = tflops;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+ << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ // run the best intance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+ auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {bias.GetDeviceBuffer()},
+ out.GetDeviceBuffer(),
+ in_lengths,
+ in_strides,
+ weight_lengths,
+ weight_strides,
+ {bias_lengths},
+ {bias_strides},
+ out_lengths,
+ out_strides,
+ conv_strides,
+ conv_dilations,
+ in_left_pad,
+ in_right_pad,
+ PassThrough{},
+ PassThrough{},
+ OutElementOp{0.5f, ActivationOp{}});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp b/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..475b2f03b4f558ad4bc319393472b926ebbfee2d
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <limits>
#include <string>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InDataType = int8_t;
+using WeiDataType = int8_t;
+using RequantScaleDataType = float;
+using OutDataType = int8_t;
+
+using InLayout = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
+using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp = PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G = 1;
+static constexpr ck::index_t N = 4;
+static constexpr ck::index_t K = 64;
+static constexpr ck::index_t C = 32;
+static constexpr ck::index_t Y = 3;
+static constexpr ck::index_t X = 3;
+static constexpr ck::index_t Hi = 71;
+static constexpr ck::index_t Wi = 71;
+static constexpr ck::index_t Ho = 36;
+static constexpr ck::index_t Wo = 36;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ std::array in_lengths{G, N, C, Hi, Wi};
+ std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+ std::array weight_lengths{G, K, C, Y, X};
+ std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+ std::array requant_scale_lengths{G, N, K, Ho, Wo};
+ std::array requant_scale_strides{K, 0, 1, 0, 0};
+ std::array out_lengths{G, N, C, Ho, Wo};
+ std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
+ std::array in_left_pad{1, 1};
+ std::array in_right_pad{1, 1};
+ std::array conv_strides{2, 2};
+ std::array conv_dilations{1, 1};
+
+ SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
+ SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
+ SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
+ SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+
+ using DeviceOp =
+ ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD,
+ OutLayout,
+ InDataType,
+ WeiDataType,
+ ck::Tuple,
+ OutDataType,
+ PassThrough,
+ PassThrough,
+ OutElementOp>;
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ std::string best_op_name;
+ int best_op_id = -1;
+ float best_avg_time = std::numeric_limits::max();
+ float best_gb_per_sec = 0;
+ float best_tflops = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+ auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {requant_scale.GetDeviceBuffer()},
+ out.GetDeviceBuffer(),
+ in_lengths,
+ in_strides,
+ weight_lengths,
+ weight_strides,
+ {requant_scale_lengths},
+ {requant_scale_strides},
+ out_lengths,
+ out_strides,
+ conv_strides,
+ conv_dilations,
+ in_left_pad,
+ in_right_pad,
+ PassThrough{},
+ PassThrough{},
+ OutElementOp{ActivationOp{}});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
+ std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
+ G * sizeof(WeiDataType) * K * Y * X * C +
+ G * sizeof(OutDataType) * N * Ho * Wo * K;
+
+ float tflops = static_cast(flop) / 1.E9 / avg_time;
+ float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+ std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ best_op_id = i;
+ best_op_name = op_name;
+ best_avg_time = avg_time;
+ best_gb_per_sec = gb_per_sec;
+ best_tflops = tflops;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+ << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ // run the best intance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+ auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {},
+ out.GetDeviceBuffer(),
+ in_lengths,
+ in_strides,
+ weight_lengths,
+ weight_strides,
+ {},
+ {},
+ out_lengths,
+ out_strides,
+ conv_strides,
+ conv_dilations,
+ in_left_pad,
+ in_right_pad,
+ PassThrough{},
+ PassThrough{},
+ OutElementOp{ActivationOp{}});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp b/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..da7b7e6abffd1f5033e4a02700a1a939a950dc0e
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <limits>
#include <string>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InDataType = int8_t;
+using WeiDataType = int8_t;
+using OutDataType = int8_t;
+
+using InLayout = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ActivationOp = PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G = 1;
+static constexpr ck::index_t N = 4;
+static constexpr ck::index_t K = 64;
+static constexpr ck::index_t C = 32;
+static constexpr ck::index_t Y = 3;
+static constexpr ck::index_t X = 3;
+static constexpr ck::index_t Hi = 71;
+static constexpr ck::index_t Wi = 71;
+static constexpr ck::index_t Ho = 36;
+static constexpr ck::index_t Wo = 36;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+ std::array in_lengths{G, N, C, Hi, Wi};
+ std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
+ std::array weight_lengths{G, K, C, Y, X};
+ std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
+ std::array out_lengths{G, N, C, Ho, Wo};
+ std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
+ std::array in_left_pad{1, 1};
+ std::array in_right_pad{1, 1};
+ std::array conv_strides{2, 2};
+ std::array conv_dilations{1, 1};
+
+ SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
+ SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
+ SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+
+ using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD,
+ OutLayout,
+ InDataType,
+ WeiDataType,
+ ck::Tuple<>,
+ OutDataType,
+ PassThrough,
+ PassThrough,
+ OutElementOp>;
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ std::string best_op_name;
+ int best_op_id = -1;
+ float best_avg_time = std::numeric_limits::max();
+ float best_gb_per_sec = 0;
+ float best_tflops = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+ auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {},
+ out.GetDeviceBuffer(),
+ in_lengths,
+ in_strides,
+ weight_lengths,
+ weight_strides,
+ {},
+ {},
+ out_lengths,
+ out_strides,
+ conv_strides,
+ conv_dilations,
+ in_left_pad,
+ in_right_pad,
+ PassThrough{},
+ PassThrough{},
+ OutElementOp{0.5f, ActivationOp{}});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
+ std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
+ G * sizeof(WeiDataType) * K * Y * X * C +
+ G * sizeof(OutDataType) * N * Ho * Wo * K;
+
+ float tflops = static_cast(flop) / 1.E9 / avg_time;
+ float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+ std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ best_op_id = i;
+ best_op_name = op_name;
+ best_avg_time = avg_time;
+ best_gb_per_sec = gb_per_sec;
+ best_tflops = tflops;
+ }
+ }
+ else
+ {
+ std::cout << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+ << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ // run the best intance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+ auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {},
+ out.GetDeviceBuffer(),
+ in_lengths,
+ in_strides,
+ weight_lengths,
+ weight_strides,
+ {},
+ {},
+ out_lengths,
+ out_strides,
+ conv_strides,
+ conv_dilations,
+ in_left_pad,
+ in_right_pad,
+ PassThrough{},
+ PassThrough{},
+ OutElementOp{0.5f, ActivationOp{}});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+
+ return 0;
+}
\ No newline at end of file
diff --git a/3rdparty/composable_kernel/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt b/3rdparty/composable_kernel/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e564f3180d8c9295356d90266f2159de47aac060
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp)
+target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_operations)
diff --git a/3rdparty/composable_kernel/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp b/3rdparty/composable_kernel/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..55c789804230ccccf66d68be9244c5c4111451e6
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <limits>
#include <numeric>
#include <string>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InDataType = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+
+using InLayout = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G = 32;
+static constexpr ck::index_t N = 256;
+static constexpr ck::index_t K = 192;
+static constexpr ck::index_t C = 192;
+static constexpr ck::index_t Y = 3;
+static constexpr ck::index_t X = 3;
+static constexpr ck::index_t Hi = 28;
+static constexpr ck::index_t Wi = 28;
+static constexpr ck::index_t Ho = 28;
+static constexpr ck::index_t Wo = 28;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main()
+{
+ std::array in_lengths{G, N, Hi, Wi, C};
+ std::array in_strides{0, 0, 0, 0, 1};
+
+ std::array wei_lengths{G, K, Y, X, C};
+ std::array wei_strides{0, 0, 0, 0, 1};
+
+ std::array out_lengths{G, N, Ho, Wo, K};
+ std::array out_strides{0, 0, 0, 0, 1};
+
+ std::partial_sum(rbegin(in_lengths),
+ std::prev(rend(in_lengths)),
+ std::next(rbegin(in_strides)),
+ std::multiplies<>{});
+ std::partial_sum(rbegin(wei_lengths),
+ std::prev(rend(wei_lengths)),
+ std::next(rbegin(wei_strides)),
+ std::multiplies<>{});
+ std::partial_sum(rbegin(out_lengths),
+ std::prev(rend(out_lengths)),
+ std::next(rbegin(out_strides)),
+ std::multiplies<>{});
+
+ // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
+ std::rotate(
+ rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
+ std::rotate(
+ rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
+ std::rotate(
+ rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
+ std::rotate(
+ rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
+ std::rotate(
+ rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
+ std::rotate(
+ rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
+
+ std::array filter_strides{1, 1};
+ std::array filter_dilations{1, 1};
+ std::array input_left_pads{1, 1};
+ std::array input_right_pads{1, 1};
+
+ SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
+ SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
+ SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
+
+ using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD,
+ InLayout,
+ OutDataType,
+ WeiDataType,
+ ck::Tuple<>,
+ InDataType,
+ PassThrough,
+ PassThrough,
+ PassThrough>;
+ // get device op instances
+ const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+ DeviceOp>::GetInstances();
+
+ std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+ std::string best_op_name;
+ int best_op_id = -1;
+ float best_avg_time = std::numeric_limits::max();
+ float best_gb_per_sec = 0;
+ float best_tflops = 0;
+
+ // profile device operation instances
+ std::cout << "Run all instances and do timing" << std::endl;
+
+ for(int i = 0; i < op_ptrs.size(); ++i)
+ {
+ auto& op_ptr = op_ptrs[i];
+ auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {},
+ in.GetDeviceBuffer(),
+ out_lengths,
+ out_strides,
+ wei_lengths,
+ wei_strides,
+ {},
+ {},
+ in_lengths,
+ in_strides,
+ filter_strides,
+ filter_dilations,
+ input_left_pads,
+ input_right_pads,
+ PassThrough{},
+ PassThrough{},
+ PassThrough{});
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+ std::string op_name = op_ptr->GetTypeString();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+ std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
+ std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
+ sizeof(WeiDataType) * G * K * Y * X * C +
+ sizeof(OutDataType) * G * N * Ho * Wo * K;
+
+ float tflops = static_cast(flop) / 1.E9 / avg_time;
+ float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+ std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
+ << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+ if(tflops > best_tflops)
+ {
+ best_op_id = i;
+ best_op_name = op_name;
+ best_avg_time = avg_time;
+ best_gb_per_sec = gb_per_sec;
+ best_tflops = tflops;
+ }
+ }
+ else
+ {
+ std::cerr << op_name << " does not support this problem" << std::endl;
+ }
+ }
+
+ if(best_op_id < 0)
+ {
+ std::cerr << "no suitable instance" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
+ << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+ // run the best intance
+ {
+ auto& op_ptr = op_ptrs[best_op_id];
+ std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+ << std::endl;
+ auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
+ wei.GetDeviceBuffer(),
+ {},
+ in.GetDeviceBuffer(),
+ out_lengths,
+ out_strides,
+ wei_lengths,
+ wei_strides,
+ {},
+ {},
+ in_lengths,
+ in_strides,
+ filter_strides,
+ filter_dilations,
+ input_left_pads,
+ input_right_pads,
+ PassThrough{},
+ PassThrough{},
+ PassThrough{});
+
+ auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+ if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+ {
+ invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+ }
+
+ std::cout << "Done" << std::endl;
+ }
+}
diff --git a/3rdparty/composable_kernel/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt b/3rdparty/composable_kernel/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3e3f6677666545a616a5664080c7fd20ac8ae4e0
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_grouped_conv2d_bwd_weight grouped_conv2d_bwd_weight.cpp)
+target_link_libraries(client_grouped_conv2d_bwd_weight PRIVATE composable_kernel::device_operations)
diff --git a/3rdparty/composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp b/3rdparty/composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ecc8568959555740e88892ff6233ca1a5cf7333
--- /dev/null
+++ b/3rdparty/composable_kernel/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
#include <array>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <limits>
#include <string>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+using InDataType = ck::half_t;
+using WeiDataType = ck::half_t;
+using OutDataType = ck::half_t;
+
+using InLayout = ck::tensor_layout::convolution::GNHWC;
+using WeiLayout = ck::tensor_layout::convolution::GKYXC;
+using OutLayout = ck::tensor_layout::convolution::GNHWK;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G = 32;
+static constexpr ck::index_t N = 256;
+static constexpr ck::index_t K = 192;
+static constexpr ck::index_t C = 192;
+static constexpr ck::index_t Y = 3;
+static constexpr ck::index_t X = 3;
+static constexpr ck::index_t Hi = 28;
+static constexpr ck::index_t Wi = 28;
+static constexpr ck::index_t Ho = 28;
+static constexpr ck::index_t Wo = 28;
+
+struct SimpleDeviceMem
+{
+ SimpleDeviceMem() = delete;
+
+ SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+ {
+ (void)hipMalloc(static_cast(&p_mem_), mem_size);
+ }
+
+ void* GetDeviceBuffer() { return p_mem_; }
+
+ ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+ void* p_mem_;
+};
+
+int main()
+{
+ std::array input_spatial_lengths{Hi, Wi};
+ std::array filter_spatial_lengths{Y, X};
+ std::array output_spatial_lengths{Ho, Wo};
+
+ std::array conv_filter_strides{1, 1};
+ std::array