initial commit

7e689d57 · aska-0096 · 7e689d57 · 7e689d57 · 7e689d57 · 7e689d57
Commit 7e689d57 authored Jul 18, 2024 by aska-0096
20 changed files
--- a/.clang-format
+++ b/.clang-format
+---
+Language:        Cpp
+AccessModifierOffset: 0
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: true
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:   
+  AfterClass:      true
+  AfterControlStatement: true
+  AfterEnum:       true
+  AfterFunction:   true
+  AfterNamespace:  false
+  AfterObjCDeclaration: true
+  AfterStruct:     true
+  AfterUnion:      true
+  BeforeCatch:     true
+  BeforeElse:      true
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit:     100
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IncludeCategories: 
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        2
+  - Regex:           '^(<|"(gtest|isl|json)/)'
+    Priority:        3
+  - Regex:           '.*'
+    Priority:        1
+IndentCaseLabels: false
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left
+ReflowComments:  true
+SortIncludes:    false
+SpaceAfterCStyleCast: false
+# SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: Never
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        8
+UseTab:          Never
+...
--- a/.clang-tidy
+++ b/.clang-tidy
+CheckOptions:
+  - key: bugprone-reserved-identifier.AllowedIdentifiers
+    value: '__HIP_PLATFORM_HCC__;__HIP_ROCclr__'
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+version: 2
+updates:
+  - package-ecosystem: "pip" # See documentation for possible values
+    directory: "/docs/sphinx" # Location of package manifests
+    open-pull-requests-limit: 10
+    schedule:
+      interval: "daily"
--- a/.gitignore
+++ b/.gitignore
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+# Precompiled Headers
+*.gch
+*.pch
+*.ipch
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+# Fortran module files
+*.mod
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+# Executables
+*.exe
+*.out
+*.app
+# vim tags
+tags
+.tags
+.*.swp
+# Editors
+.vscode
+# build-in-source directory
+build*
+# emacs temporary/backup files
+.\#*
+\#*\#
+*~
+# GDB temporary files
+.gdb_history
+install.dir*
+# documentation artifacts
+_build/
+_images/
+_static/
+_templates/
+_toc.yml
+docBin/
+_doxygen/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
+repos:
+-   repo: local
+    hooks:
+    -   id: clang-format
+        name: clang-format
+        entry: clang-format-12 -i --style=file
+        language: system
+        types_or: [c++, inc]
+    -   id: copyright-year-checker
+        name: copyright-year-checker
+        entry: script/check_copyright_year.sh
+        verbose: false
+        language: script
+        types: [c++]
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+version: 2
+build:
+   os: ubuntu-22.04
+   tools:
+      python: "3.8"
+sphinx:
+   configuration: docs/conf.py
+formats: [htmlzip, pdf, epub]
+python:
+   install:
+   - requirements: docs/sphinx/requirements.txt
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
+# Change Log for Composable Kernel
+Full documentation for Composable Kernel is not yet available.
+## CK 0.2.0 for ROCm 5.5.0
+### Fixed
+- Fixed a bug in 6-dimensional kernels (#555).
+- Fixed grouped ConvBwdWeight test case failure (#524).
+### Optimizations
+- Improve proformance of normalization kernel
+### Added
+- Added new cmake flag "DL_KERNELS" must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances.
+- Added new cmake flag "DTYPES" which could be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instance of select data types.
+- Added new cmake flag "INSTANCES_ONLY" which will only build CK library and instances without the tests, examples, or profiler.
+- Added new feature: if GPU_TARGETS is not set on cmake command line, CK will be built for all targets supported by compiler.
+- Added support on MI300A/MI300X.
+- Added support on NAVI3x.
+- Added user tutorial (#563).
+- Added more instances for irregular GEMM sizes (#560).
+- Added inter-wave consumer-producer programming model for GEMM kernels (#310).
+- Added multi-D GEMM client APIs (#534).
+- Added multi-embeddings support (#542).
+- Added Navi3x blockwise GEMM and real GEMM support (#541).
+- Added Navi grouped ConvBwdWeight support (#505).
+- Added MaxPool, AvgPool forward (#815).
+- Added MaxPool backward (#750).
+### Changed
+- Changed ...
--- a/CITATION.cff
+++ b/CITATION.cff
+cff-version: 1.2.0
+title: Composable Kernel
+message: If you use this software, please cite using the following metadata.
+type: software
+authors:
+  - given-names: Chao
+    family-names: Liu
+    email: chao.liu2@amd.com
+    affiliation: AMD
+  - given-names: Jing
+    family-names: Zhang
+    email: jing.zhang3@amd.com
+    affiliation: AMD
+  - given-names: Letao
+    family-names: Qin
+    email: letao.qin@amd.com
+    affiliation: AMD
+  - given-names: Qianfeng
+    family-names: Zhang
+    email: qianfeng.zhang@amd.com
+    affiliation: AMD
+  - given-names: Liang
+    family-names: Huang
+    email: carlus.huang@amd.com
+    affiliation: AMD
+  - given-names: Shaojie
+    family-names: Wang
+    email: shaojie.wang@amd.com
+    affiliation: AMD
+  - given-names: Anthony
+    family-names: Chang
+    email: antc@amd.com
+    affiliation: AMD
+  - given-names: Chunyu
+    family-names: Lai
+    email: chunyu.lai@amd.com
+    affiliation: AMD
+  - given-names: Illia
+    family-names: Silin
+    email: illia.silin@amd.com
+    affiliation: AMD
+  - given-names: Adam
+    family-names: Osewski
+    email: adam.osewski@amd.com
+    affiliation: AMD
+  - given-names: Poyen
+    family-names: Chen
+    email: poyen.chen@amd.com
+    affiliation: AMD
+  - given-names: Rosty
+    family-names: Geyyer
+    email: rosty.geyyer@amd.com
+    affiliation: AMD
+  - given-names: Hanwen
+    family-names: Chen
+  - given-names: Tejash
+    family-names: Shah
+  - given-names: Xiaoyan
+    family-names: Zhou
+  - given-names: Jianfeng
+    family-names: Yan
+repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel'
+abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel progarmming languages, like HIP C++.
+keywords:
+  - 'CK, Composable Kernel, Tensor Coordinate Transformation'
+license: MIT
+license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+cmake_minimum_required(VERSION 3.14)
+set(version 1.1.0)
+# Check support for CUDA/HIP in Cmake
+project(composable_kernel VERSION ${version})
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+if (DTYPES)
+    add_definitions(-DDTYPES)
+    if (DTYPES MATCHES "int8")
+        add_definitions(-DCK_ENABLE_INT8)
+        set(CK_ENABLE_INT8 "ON")
+    endif()
+    if (DTYPES MATCHES "fp8")
+        add_definitions(-DCK_ENABLE_FP8)
+        set(CK_ENABLE_FP8 "ON")
+    endif()
+    if (DTYPES MATCHES "fp16")
+        add_definitions(-DCK_ENABLE_FP16)
+        set(CK_ENABLE_FP16 "ON")
+    endif()
+    if (DTYPES MATCHES "fp32")
+        add_definitions(-DCK_ENABLE_FP32)
+        set(CK_ENABLE_FP32 "ON")
+    endif()
+    if (DTYPES MATCHES "fp64")
+        add_definitions(-DCK_ENABLE_FP64)
+        set(CK_ENABLE_FP64 "ON")
+    endif()
+    if (DTYPES MATCHES "bf16")
+        add_definitions(-DCK_ENABLE_BF16)
+        set(CK_ENABLE_BF16 "ON")
+    endif()
+    message("DTYPES macro set to ${DTYPES}")
+else()
+    add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
+    set(CK_ENABLE_ALL_DTYPES "ON")
+endif()
+if(DL_KERNELS)
+    add_definitions(-DDL_KERNELS)
+    set(CK_ENABLE_DL_KERNELS "ON")
+endif()
+if(INSTANCES_ONLY)
+    add_definitions(-DINSTANCES_ONLY)
+    set(CK_ENABLE_INSTANCES_ONLY "ON")
+endif()
+# CK config file to record supported datatypes, etc.
+configure_file("${PROJECT_SOURCE_DIR}/include/ck/config.h.in" "${PROJECT_BINARY_DIR}/include/ck/config.h")
+# CK version file to record release version as well as git commit hash
+find_package(Git REQUIRED)
+execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD OUTPUT_VARIABLE COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE)
+configure_file("${PROJECT_SOURCE_DIR}/include/ck/version.h.in" "${PROJECT_BINARY_DIR}/include/ck/version.h")
+enable_testing()
+set(ROCM_SYMLINK_LIBS OFF)
+find_package(ROCM REQUIRED PATHS /opt/rocm)
+include(ROCMInstallTargets)
+include(ROCMPackageConfigHelpers)
+include(ROCMSetupVersion)
+include(ROCMInstallSymlinks)
+include(ROCMCreatePackage)
+include(CheckCXXCompilerFlag)
+include(ROCMCheckTargetIds)
+include(TargetFlags)
+rocm_setup_version(VERSION ${version})
+list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
+message("GPU_TARGETS= ${GPU_TARGETS}")
+message("checking which targets are supported")
+#This is the list of targets to be used in case GPU_TARGETS is not set on command line
+#These targets will be filtered and only supported ones will be used
+#Setting GPU_TARGETS on command line will override this list
+if(NOT PROFILER_ONLY)
+    rocm_check_target_ids(DEFAULT_GPU_TARGETS
+        TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
+else()
+    add_definitions(-DPROFILER_ONLY)
+    if(GPU_TARGETS)
+        message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx9, gfx10, or gfx11")
+    endif()
+    if(GPU_ARCH MATCHES "gfx9")
+        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942")
+    elseif(GPU_ARCH MATCHES "gfx10")
+        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
+    elseif(GPU_ARCH MATCHES "gfx11")
+        rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
+    else()
+        message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx9, gfx10, or gfx11")
+    endif()
+endif()
+message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}")
+set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " ")
+if(GPU_TARGETS)
+    message("Building CK for the following targets: ${GPU_TARGETS}")
+else()
+    message("Building CK for the following targets: ${AMDGPU_TARGETS}")
+endif()
+find_package(hip)
+# No assumption that HIP kernels are launched with uniform block size for backward compatibility
+# SWDEV-413293 and https://reviews.llvm.org/D155213
+math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) * 100000 + ${hip_VERSION_PATCH}")
+message("hip_version_flat=${hip_VERSION_FLAT}")
+if(${hip_VERSION_FLAT} GREATER 500723302)
+   message("Adding the fno-offload-uniform-block compiler flag")
+   add_compile_options(-fno-offload-uniform-block)
+endif()
+option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
+option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
+if(USE_BITINT_EXTENSION_INT4)
+    add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
+    add_compile_options(-Wno-bit-int-extension)
+    message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
+endif()
+if(USE_OPT_NAVI3X)
+    add_compile_options(-mcumode)
+    add_compile_options(-mno-wavefrontsize64)
+    message("CK compiled with USE_OPT_NAVI3X set to ${USE_OPT_NAVI3X}")
+endif()
+## Threads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
+link_libraries(Threads::Threads)
+## C++
+enable_language(CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+## OpenMP
+if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+	# workaround issue hipcc in rocm3.5 cannot find openmp
+	set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
+	set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
+	set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
+	set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+	set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+	set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+else()
+	find_package(OpenMP REQUIRED)
+endif()
+message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
+message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
+message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
+message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
+link_libraries(${OpenMP_gomp_LIBRARY})
+link_libraries(${OpenMP_pthread_LIBRARY})
+## HIP
+find_package(HIP REQUIRED)
+# Override HIP version in config.h, if necessary.
+# The variables set by find_package() can't be overwritten,
+# therefore let's use intermediate variables.
+set(CK_HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}")
+set(CK_HIP_VERSION_MINOR "${HIP_VERSION_MINOR}")
+set(CK_HIP_VERSION_PATCH "${HIP_VERSION_PATCH}")
+if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR )
+    set(CK_HIP_VERSION_MAJOR "${CK_OVERRIDE_HIP_VERSION_MAJOR}")
+    message(STATUS "CK_HIP_VERSION_MAJOR overriden with ${CK_OVERRIDE_HIP_VERSION_MAJOR}")
+endif()
+if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR )
+    set(CK_HIP_VERSION_MINOR "${CK_OVERRIDE_HIP_VERSION_MINOR}")
+    message(STATUS "CK_HIP_VERSION_MINOR overriden with ${CK_OVERRIDE_HIP_VERSION_MINOR}")
+endif()
+if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
+    set(CK_HIP_VERSION_PATCH "${CK_OVERRIDE_HIP_VERSION_PATCH}")
+    message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
+endif()
+message(STATUS "Build with HIP ${HIP_VERSION}")
+link_libraries(hip::device)
+add_compile_definitions(__HIP_PLATFORM_HCC__=1)
+## tidy
+include(EnableCompilerWarnings)
+set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
+if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
+    set(CK_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter)
+# Enable tidy on hip
+elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU")
+    set(CK_TIDY_ERRORS ALL)
+endif()
+include(ClangTidy)
+enable_clang_tidy(
+    CHECKS
+        *
+        -abseil-*
+        -android-cloexec-fopen
+        # Yea we shouldn't be using rand()
+        -cert-msc30-c
+        -bugprone-exception-escape
+        -bugprone-macro-parentheses
+        -cert-env33-c
+        -cert-msc32-c
+        -cert-msc50-cpp
+        -cert-msc51-cpp
+        -cert-dcl37-c
+        -cert-dcl51-cpp
+        -clang-analyzer-alpha.core.CastToStruct
+        -clang-analyzer-optin.performance.Padding
+        -clang-diagnostic-deprecated-declarations
+        -clang-diagnostic-extern-c-compat
+        -clang-diagnostic-unused-command-line-argument
+        -cppcoreguidelines-avoid-c-arrays
+        -cppcoreguidelines-avoid-magic-numbers
+        -cppcoreguidelines-explicit-virtual-functions
+        -cppcoreguidelines-init-variables
+        -cppcoreguidelines-macro-usage
+        -cppcoreguidelines-non-private-member-variables-in-classes
+        -cppcoreguidelines-pro-bounds-array-to-pointer-decay
+        -cppcoreguidelines-pro-bounds-constant-array-index
+        -cppcoreguidelines-pro-bounds-pointer-arithmetic
+        -cppcoreguidelines-pro-type-member-init
+        -cppcoreguidelines-pro-type-reinterpret-cast
+        -cppcoreguidelines-pro-type-union-access
+        -cppcoreguidelines-pro-type-vararg
+        -cppcoreguidelines-special-member-functions
+        -fuchsia-*
+        -google-explicit-constructor
+        -google-readability-braces-around-statements
+        -google-readability-todo
+        -google-runtime-int
+        -google-runtime-references
+        -hicpp-vararg
+        -hicpp-braces-around-statements
+        -hicpp-explicit-conversions
+        -hicpp-named-parameter
+        -hicpp-no-array-decay
+        # We really shouldn't use bitwise operators with signed integers, but
+        # opencl leaves us no choice
+        -hicpp-avoid-c-arrays
+        -hicpp-signed-bitwise
+        -hicpp-special-member-functions
+        -hicpp-uppercase-literal-suffix
+        -hicpp-use-auto
+        -hicpp-use-equals-default
+        -hicpp-use-override
+        -llvm-header-guard
+        -llvm-include-order
+        #-llvmlibc-*
+        -llvmlibc-restrict-system-libc-headers
+        -llvmlibc-callee-namespace
+        -llvmlibc-implementation-in-namespace
+        -llvm-else-after-return
+        -llvm-qualified-auto
+        -misc-misplaced-const
+        -misc-non-private-member-variables-in-classes
+        -misc-no-recursion
+        -modernize-avoid-bind
+        -modernize-avoid-c-arrays
+        -modernize-pass-by-value
+        -modernize-use-auto
+        -modernize-use-default-member-init
+        -modernize-use-equals-default
+        -modernize-use-trailing-return-type
+        -modernize-use-transparent-functors
+        -performance-unnecessary-value-param
+        -readability-braces-around-statements
+        -readability-else-after-return
+        # we are not ready to use it, but very useful
+        -readability-function-cognitive-complexity
+        -readability-isolate-declaration
+        -readability-magic-numbers
+        -readability-named-parameter
+        -readability-uppercase-literal-suffix
+        -readability-convert-member-functions-to-static
+        -readability-qualified-auto
+        -readability-redundant-string-init
+        # too many narrowing conversions in our code
+        -bugprone-narrowing-conversions
+        -cppcoreguidelines-narrowing-conversions
+        -altera-struct-pack-align
+        -cppcoreguidelines-prefer-member-initializer
+        ${CK_TIDY_CHECKS}
+        ${CK_TIDY_ERRORS}
+    HEADER_FILTER
+        "\.hpp$"
+    EXTRA_ARGS
+        -DCK_USE_CLANG_TIDY
+)
+include(CppCheck)
+enable_cppcheck(
+    CHECKS
+        warning
+        style
+        performance
+        portability
+    SUPPRESS
+        ConfigurationNotChecked
+        constStatement
+        duplicateCondition
+        noExplicitConstructor
+        passedByValue
+        preprocessorErrorDirective
+        shadowVariable
+        unusedFunction
+        unusedPrivateFunction
+        unusedStructMember
+        unmatchedSuppression
+    FORCE
+    SOURCES
+        library/src
+    INCLUDE
+        ${CMAKE_CURRENT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_BINARY_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/library/include
+    DEFINE
+        CPPCHECK=1
+        __linux__=1
+)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
+# set CK project include directories
+include_directories(BEFORE
+    ${PROJECT_BINARY_DIR}/include
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/library/include
+    ${HIP_INCLUDE_DIRS}
+)
+SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
+if(BUILD_DEV)
+    add_compile_options(-Werror)
+    add_compile_options(-Weverything)
+endif()
+message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
+file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
+file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
+set(CK_DEVICE_INSTANCES)
+FOREACH(subdir_path ${dir_list})
+set(target_dir)
+IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
+    set(cmake_instance)
+    file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
+    set(add_inst 0)
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8")
+        #message("fp8 instance found!")
+        set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16")
+        #message("fp16 instance found!")
+        set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32")
+        #message("fp32 instance found!")
+        set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64")
+        #message("fp64 instance found!")
+        set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16")
+        #message("bf16 instance found!")
+        set(add_inst 1)
+    endif()
+    if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8")
+        #message("int8 instance found!")
+        set(add_inst 1)
+    endif()
+    if(NOT "${cmake_instance}" MATCHES "DTYPES")
+        #message("instance should be built for all types!")
+        set(add_inst 1)
+    endif()
+    if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
+        list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
+    endif()
+ENDIF()
+ENDFOREACH()
+add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES}  SOURCES ${INSTANCE_FILES})
+add_subdirectory(library)
+if(NOT DEFINED INSTANCES_ONLY)
+ if(NOT DEFINED PROFILER_ONLY)
+   rocm_package_setup_component(tests
+        LIBRARY_NAME composablekernel
+        PACKAGE_NAME tests # Prevent -static suffix on package name
+   )
+   rocm_package_setup_component(examples
+        LIBRARY_NAME composablekernel
+        PACKAGE_NAME examples
+   )
+   add_subdirectory(example)
+   add_subdirectory(test)
+   rocm_package_setup_component(profiler
+        LIBRARY_NAME composablekernel
+        PACKAGE_NAME ckProfiler
+   )
+   add_subdirectory(profiler)
+  else()
+    #When building PROFILER_ONLY, label the package with GPU_ARCH
+    rocm_package_setup_component(profiler
+       LIBRARY_NAME composablekernel
+       PACKAGE_NAME ckProfiler_${GPU_ARCH}
+    )
+    add_subdirectory(profiler)
+  endif()
+endif()
+#Create an interface target for the include only files and call it "composablekernels"
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+    VERSION "${version}"
+    COMPATIBILITY AnyNewerVersion
+)
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+    NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+rocm_install(FILES
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
+# Install CK version and configuration files
+rocm_install(FILES
+    ${PROJECT_BINARY_DIR}/include/ck/version.h
+    ${PROJECT_BINARY_DIR}/include/ck/config.h
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/
+)
+set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
+set(CPACK_RPM_PACKAGE_LICENSE "MIT")
+rocm_create_package(
+    NAME composablekernel
+    DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
+    MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
+    LDCONFIG
+    HEADER_ONLY
+)
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
+# Composable Kernel Developers and Contributors
+This is the list of developers and contributors to Composable Kernel library
+## Developers
+[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2023
+[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2023
+[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), [Astha Rai](https://github.com/arai713), [Shi YanXing](https://github.com/Yanxing-Shi), 2022-2023
+[Hari Sadasivan](https://github.com/hsadasiv), [Bartlomiej Kocot](https://github.com/bartekxk), [Bartlomiej Wroblewski](https://github.com/bwroblew), 2023
+Hanwen Chang, 2019-2021,
+Tejash Shah, 2019-2020
+Xiaoyan Zhou, 2020
+[Jianfeng Yan](https://github.com/j4yan), 2021-2022
+## Product Manager
+[Jun Liu](https://github.com/junliume)
+## Contributors
+[Dan Yao](https://github.com/danyao12), [Guangzhao Lu](https://github.com/guangzlu), [Raman Jana](https://github.com/ramjana), [Jehandad Khan](https://github.com/JehandadKhan), [Wen-Heng (Jack) Chung](https://github.com/whchung)
+## Acknowledgement
+CK team works closely with Meta [AITemplate](https://github.com/facebookincubator/AITemplate) team ([Bing Xu](https://github.com/antinucleon), [Hao Lu](https://github.com/hlu1), [Ying Zhang](https://github.com/ipiszy), etc). Most of the lucrative graph optimization opportunities in ML models were identified by AITemplate team, and we also co-designed many high performance fused kernels for AMD GPUs. Without this collaboration, CK would not reach its current potential.
--- a/Config.cmake.in
+++ b/Config.cmake.in
+@PACKAGE_INIT@
+set(_composable_kernel_supported_components device_operations utility)
+foreach(_comp ${composable_kernel_FIND_COMPONENTS})
+	if(NOT _comp IN_LIST _composable_kernel_supported_components)
+		set(composable_kernel_FOUND False)
+		set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}")
+	endif()
+	include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake")
+endforeach()
--- a/Dockerfile
+++ b/Dockerfile
+FROM ubuntu:20.04
+ARG DEBIAN_FRONTEND=noninteractive
+ARG ROCMVERSION=5.6
+ARG compiler_version=""
+ARG compiler_commit=""
+RUN set -xe
+ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
+RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
+# Add rocm repository
+RUN chmod 1777 /tmp
+RUN apt-get update
+RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
+ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
+RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
+RUN wget https://repo.radeon.com/amdgpu-install/5.6/ubuntu/focal/amdgpu-install_5.6.50600-1_all.deb  --no-check-certificate
+RUN apt-get update && \
+DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+    ./amdgpu-install_5.6.50600-1_all.deb
+RUN if [ "$ROCMVERSION" != "5.7" ]; then \
+        wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
+        sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
+        sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
+    elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
+        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
+        apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
+        amdgpu-repo --amdgpu-build=1609671 --rocm-build=compute-rocm-npi-mi300/1354; \
+    elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "rc1" ]; then \
+        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
+        apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
+        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.7 rel-19 > /etc/apt/sources.list.d/rocm-build.list' && \
+        amdgpu-repo --amdgpu-build=1637781; \
+    fi
+RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
+RUN amdgpu-install -y --usecase=rocm --no-dkms
+# Install dependencies
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+    build-essential \
+    ccache \
+    cmake \
+    git \
+    hip-rocclr \
+    jq \
+    libelf-dev \
+    libncurses5-dev \
+    libnuma-dev \
+    libpthread-stubs0-dev \
+    llvm-amdgpu \
+    pkg-config \
+    python \
+    python3 \
+    python3-dev \
+    python3-pip \
+    sshpass \
+    software-properties-common \
+    vim \
+    nano \
+    zlib1g-dev \
+    openssh-server \
+    clang-format-12 \
+    kmod && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+#Install latest version of cmake
+RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
+RUN gunzip /usr/local/bin/ninja.gz
+RUN chmod a+x /usr/local/bin/ninja
+RUN git clone https://github.com/nico/ninjatracing.git
+RUN apt purge --auto-remove -y cmake
+RUN apt update
+RUN apt install -y software-properties-common lsb-release
+RUN apt clean all
+RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
+RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
+RUN apt install -y kitware-archive-keyring
+RUN rm /etc/apt/trusted.gpg.d/kitware.gpg
+RUN apt install -y cmake
+# Setup ubsan environment to printstacktrace
+RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
+ENV UBSAN_OPTIONS=print_stacktrace=1
+# Install an init system
+RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb
+RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb
+ARG PREFIX=/opt/rocm
+# Install packages for processing the performance results
+RUN pip3 install --upgrade pip
+RUN pip3 install sqlalchemy==1.4.46
+RUN pip3 install pymysql
+RUN pip3 install pandas
+RUN pip3 install setuptools-rust
+RUN pip3 install sshtunnel
+# Setup ubsan environment to printstacktrace
+ENV UBSAN_OPTIONS=print_stacktrace=1
+ENV LC_ALL=C.UTF-8
+ENV LANG=C.UTF-8
+RUN groupadd -f render
+# Install the new rocm-cmake version
+RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git  && \
+  cd rocm-cmake && mkdir build && cd build && \
+  cmake  .. && cmake --build . && cmake --build . --target install
+WORKDIR /
+ENV compiler_version=$compiler_version
+ENV compiler_commit=$compiler_commit
+RUN sh -c "echo compiler version = '$compiler_version'"
+RUN sh -c "echo compiler commit = '$compiler_commit'"
+RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" = "" ]; then \
+        git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
+        cd llvm-project && mkdir build && cd build && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
+        make -j 8 ; \
+    else echo "using the release compiler"; \
+    fi
+RUN if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler_commit" != "" ]; then \
+        git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \
+        cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \
+        cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \
+        make -j 8 ; \
+    else echo "using the release compiler"; \
+    fi
+#ENV HIP_CLANG_PATH='/llvm-project/build/bin'
+#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'"
--- a/Jenkinsfile
+++ b/Jenkinsfile
--- a/LICENSE
+++ b/LICENSE
+Copyright (c) 2018-    , Advanced Micro Devices, Inc. (Chao Liu, Jing Zhang)
+Copyright (c) 2019-    , Advanced Micro Devices, Inc. (Letao Qin, Qianfeng Zhang, Liang Huang, Shaojie Wang)
+Copyright (c) 2022-    , Advanced Micro Devices, Inc. (Anthony Chang, Chunyu Lai, Illia Silin, Adam Osewski, Poyen Chen, Jehandad Khan)
+Copyright (c) 2019-2021, Advanced Micro Devices, Inc. (Hanwen Chang)
+Copyright (c) 2019-2020, Advanced Micro Devices, Inc. (Tejash Shah)
+Copyright (c) 2020     , Advanced Micro Devices, Inc. (Xiaoyan Zhou)
+Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan)
+SPDX-License-Identifier: MIT
+Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
+# Composable Kernel
+## Methodology
+Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++.
+CK utilizes two concepts to achieve performance portability and code maintainability:
+* A tile-based programming model
+* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation".
+![ALT](/docs/data/ck_component.png "CK Components")
+## Code Structure
+Current CK library are structured into 4 layers:
+* "Templated Tile Operators" layer
+* "Templated Kernel and Invoker" layer
+* "Instantiated Kernel and Invoker" layer
+* "Client API" layer
+![ALT](/docs/data/ck_layer.png "CK Layers")
+## Documentation
+Run the steps below to build documentation locally.
+```
+cd docs
+pip3 install -r sphinx/requirements.txt
+python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+```
+## Contributors
+The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md)
+## Citation
+If you use CK, please use following citations:
+* CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???)
+* [CITATION.cff](/CITATION.cff)
+## License
+CK is released under the MIT license. [License File](/LICENSE)
+# Build CK
+## Build docker image
+```bash
+DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
+```
+Pre-built dockers are available from this public repo: 
+https://hub.docker.com/r/rocm/composable_kernel/tags
+## Launch docker
+```bash
+docker run                                     \
+-it                                            \
+--privileged                                   \
+--group-add sudo                               \
+-w /root/workspace                             \
+-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace  \
+ck:latest                                      \
+/bin/bash
+```
+## Build CK
+```bash
+mkdir build && cd build
+# Need to specify target ID, example below is for gfx908 and gfx90a
+cmake                                                                                             \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
+-D CMAKE_BUILD_TYPE=Release                                                                       \
+-D GPU_TARGETS="gfx908;gfx90a"                                                                    \
+..
+```
+If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the 
+current compiler.
+Additional cmake flags can be used to significantly speed-up the build:
+INSTANCES_ONLY (by default is OFF) must be set to ON in order to build only the instances and library
+while skipping all tests, examples, and profiler. This is useful for libraries that use CK as a dependency.
+DTYPES (by default not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances 
+of select data types only. Currently, building of int8 instances is taking a lot of time (the compiler fix is in the works).
+DL_KERNELS (by default is OFF) must be set to ON in order to build the gemm_dl and batched_gemm_multi_d_dl 
+instances. Those instances are only needed for the NAVI2x platforms.
+### Build examples and tests
+```bash
+ make -j examples tests
+ make test
+```
+Instructions for running each individual examples are under [example](/example)
+## Build ckProfiler
+```bash
+ make -j ckProfiler
+```
+Instructions for running ckProfiler are under [profiler](/profiler)
+## Install CK
+```bash
+make install
+```
+## Using CK as pre-built kernel library
+Instructions for using CK as a pre-built kernel library are under [client_example](/client_example)
+## Contributing
+When you contribute to Composable Kernel, make sure to run `clang-format` on all the changed files. We highly recommend using git hooks that are managed by the `pre-commit` framework. To install hooks, run:
+```bash
+sudo script/install_precommit.sh
+```
+This way, `pre-commit` will add the appropriate hooks to your local repository and automatically run `clang-format` (and possibly additional checks) before any commit is created.
+If you need to uninstall hooks from the repository, you can do so by running the following command:
+```bash
+script/uninstall_precommit.sh
+```
+If for any reason, you need to temporarily disable precommit hooks, you can add the `--no-verify` option to the `git commit` command.
+## Caveat
+### Kernel Timing and Verification
+CK's own kernel timer will warn up kernel once, and then run it multiple times
+to get average kernel time. For some kernels that use atomic add, this will cause
+output buffer to be accumulated multiple times, causing verification failure.
+To work around it, do not use CK's own timer and do verification at the same time.
+CK's own timer and verification in each example and ckProfiler can be enabled or
+disabled from command line.
--- a/client_example/01_gemm/CMakeLists.txt
+++ b/client_example/01_gemm/CMakeLists.txt
+add_executable(client_gemm gemm.cpp)
+target_link_libraries(client_gemm PRIVATE composable_kernel::device_operations)
--- a/client_example/01_gemm/gemm.cpp
+++ b/client_example/01_gemm/gemm.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+using ADataType = F16;
+using BDataType = F16;
+using CDataType = F16;
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 7)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+        StrideA = std::stoi(argv[4]);
+        StrideB = std::stoi(argv[5]);
+        StrideC = std::stoi(argv[6]);
+    }
+    else
+    {
+        printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n");
+        exit(0);
+    }
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceGemm<ALayout,
+                                                 BLayout,
+                                                 CLayout,
+                                                 ADataType,
+                                                 BDataType,
+                                                 CDataType,
+                                                 ck::tensor_operation::element_wise::PassThrough,
+                                                 ck::tensor_operation::element_wise::PassThrough,
+                                                 ck::tensor_operation::element_wise::PassThrough>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        StrideC,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        c_element_op);
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+    // run the best intance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        StrideC,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        c_element_op);
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
+++ b/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
+add_custom_target(client_gemm_fastgelu_examples)
+add_executable(client_gemm_add_add_fastgelu gemm_add_add_fastgelu.cpp)
+target_link_libraries(client_gemm_add_add_fastgelu PRIVATE composable_kernel::device_operations)
+add_executable(client_gemm_add_fastgelu gemm_add_fastgelu.cpp)
+target_link_libraries(client_gemm_add_fastgelu PRIVATE composable_kernel::device_operations)
+add_executable(client_gemm_fastgelu gemm_fastgelu.cpp)
+target_link_libraries(client_gemm_fastgelu PRIVATE composable_kernel::device_operations)
+add_dependencies(client_gemm_fastgelu_examples client_gemm_add_add_fastgelu client_gemm_add_fastgelu
+                 client_gemm_fastgelu)
+add_custom_target(client_gemm_fastgelu_generic_examples)
+add_executable(client_gemm_add_add_fastgelu_generic gemm_add_add_fastgelu_generic.cpp)
+target_link_libraries(client_gemm_add_add_fastgelu_generic PRIVATE composable_kernel::device_operations)
+add_executable(client_gemm_add_fastgelu_generic gemm_add_fastgelu_generic.cpp)
+target_link_libraries(client_gemm_add_fastgelu_generic PRIVATE composable_kernel::device_operations)
+add_executable(client_gemm_fastgelu_generic gemm_fastgelu_generic.cpp)
+target_link_libraries(client_gemm_fastgelu_generic PRIVATE composable_kernel::device_operations)
+add_dependencies(client_gemm_fastgelu_generic_examples client_gemm_add_add_fastgelu_generic 
+                 client_gemm_add_fastgelu_generic client_gemm_fastgelu_generic)
--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+using ADataType  = F16;
+using BDataType  = F16;
+using D0DataType = F16;
+using D1DataType = F16;
+using EDataType  = F16;
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using ELayout  = Row;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    ck::index_t StrideA  = 4096;
+    ck::index_t StrideB  = 4096;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideD1 = 4096;
+    ck::index_t StrideE  = 4096;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 9)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+        StrideA  = std::stoi(argv[4]);
+        StrideB  = std::stoi(argv[5]);
+        StrideD0 = std::stoi(argv[6]);
+        StrideD1 = std::stoi(argv[7]);
+        StrideE  = std::stoi(argv[8]);
+    }
+    else
+    {
+        printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
+        exit(0);
+    }
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+                                      f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+    SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
+                                      f_matrix_space_size(M, N, StrideD1, D1Layout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<D0Layout, D1Layout>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<D0DataType, D1DataType>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::AddAddFastGelu>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            a_device_buf.GetDeviceBuffer(),
+            b_device_buf.GetDeviceBuffer(),
+            std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+                                       d1_m_n_device_buf.GetDeviceBuffer()},
+            e_device_buf.GetDeviceBuffer(),
+            M,
+            N,
+            K,
+            StrideA,
+            StrideB,
+            std::array<ck::index_t, 2>{StrideD0, StrideD1},
+            StrideE,
+            a_element_op,
+            b_element_op,
+            cde_element_op);
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t flop = std::size_t(2) * M * N * K;
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+    // run the best intance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            a_device_buf.GetDeviceBuffer(),
+            b_device_buf.GetDeviceBuffer(),
+            std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+                                       d1_m_n_device_buf.GetDeviceBuffer()},
+            e_device_buf.GetDeviceBuffer(),
+            M,
+            N,
+            K,
+            StrideA,
+            StrideB,
+            std::array<ck::index_t, 2>{StrideD0, StrideD1},
+            StrideE,
+            a_element_op,
+            b_element_op,
+            cde_element_op);
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+#include <stdexcept>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
+using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddAddFastGelu;
+using ADataType  = F16;
+using BDataType  = F16;
+using D0DataType = F16;
+using D1DataType = F16;
+using EDataType  = F16;
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using ELayout  = Row;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+    ck::index_t StrideA  = 4096;
+    ck::index_t StrideB  = 4096;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideD1 = 4096;
+    ck::index_t StrideE  = 4096;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 9)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+        StrideA  = std::stoi(argv[4]);
+        StrideB  = std::stoi(argv[5]);
+        StrideD0 = std::stoi(argv[6]);
+        StrideD1 = std::stoi(argv[7]);
+        StrideE  = std::stoi(argv[8]);
+    }
+    else
+    {
+        printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
+        exit(0);
+    }
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) *
+                                      f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+    SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) *
+                                      f_matrix_space_size(M, N, StrideD1, D1Layout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<D0Layout, D1Layout>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<D0DataType, D1DataType>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::AddAddFastGelu>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+    // get generic instance
+    auto& op_ptr = op_ptrs[0];
+    std::cout << "Run the generic instance without timing: " << op_ptr->GetTypeString()
+              << std::endl;
+    // run the generic instance
+    auto argument_ptr =
+        op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                    b_device_buf.GetDeviceBuffer(),
+                                    std::array<const void*, 2>{d0_m_n_device_buf.GetDeviceBuffer(),
+                                                               d1_m_n_device_buf.GetDeviceBuffer()},
+                                    e_device_buf.GetDeviceBuffer(),
+                                    M,
+                                    N,
+                                    K,
+                                    StrideA,
+                                    StrideB,
+                                    std::array<ck::index_t, 2>{StrideD0, StrideD1},
+                                    StrideE,
+                                    a_element_op,
+                                    b_element_op,
+                                    cde_element_op);
+    auto invoker_ptr = op_ptr->MakeInvokerPointer();
+    if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+    {
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+    }
+    else
+    {
+        throw std::runtime_error(
+            "Generic instance should be suitable for various input lengths/strides");
+    }
+    std::cout << "Done" << std::endl;
+    return 0;
+}