Unverified Commit e1a5137e authored by arai713's avatar arai713 Committed by GitHub
Browse files

Merge branch 'develop' into transpose_5d

parents eb57178d 718065eb
# Documentation files
docs/* @saadrahim @LisaDelaney
*.md @saadrahim @LisaDelaney
*.rst @saadrahim @LisaDelaney
# Header directory
library/include/* @saadrahim @LisaDelaney
...@@ -3,7 +3,7 @@ repos: ...@@ -3,7 +3,7 @@ repos:
hooks: hooks:
- id: clang-format - id: clang-format
name: clang-format name: clang-format
entry: clang-format-10 -i --style=file entry: clang-format-12 -i --style=file
language: system language: system
types_or: [c++, inc] types_or: [c++, inc]
- id: copyright-year-checker - id: copyright-year-checker
......
cmake_minimum_required(VERSION 3.14) cmake_minimum_required(VERSION 3.14)
# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT MSVC_IDE AND NOT CMAKE_BUILD_TYPE )
set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()
# Default installation path
if(WIN32)
set(CMAKE_INSTALL_PREFIX "/opt/rocm/x86_64-w64-mingw32" CACHE PATH "")
else()
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
endif()
set(version 1.1.0)
# Check support for CUDA/HIP in Cmake # Check support for CUDA/HIP in Cmake
project(composable_kernel) project(composable_kernel VERSION ${version})
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
if (DTYPES) if (DTYPES)
add_definitions(-DDTYPES) add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8") if (DTYPES MATCHES "int8")
add_definitions(-D__int8__) add_definitions(-DCK_ENABLE_INT8)
endif() set(CK_ENABLE_INT8 "ON")
if (DTYPES MATCHES "fp8") endif()
add_definitions(-D__fp8__) if (DTYPES MATCHES "fp8")
endif() add_definitions(-DCK_ENABLE_FP8)
if (DTYPES MATCHES "fp16") set(CK_ENABLE_FP8 "ON")
add_definitions(-D__fp16__) add_compile_options(-Wno-bit-int-extension)
endif() endif()
if (DTYPES MATCHES "fp32") if (DTYPES MATCHES "bf8")
add_definitions(-D__fp32__) add_definitions(-DCK_ENABLE_BF8)
endif() set(CK_ENABLE_BF8 "ON")
if (DTYPES MATCHES "fp64") add_compile_options(-Wno-bit-int-extension)
add_definitions(-D__fp64__) endif()
endif() if (DTYPES MATCHES "fp16")
if (DTYPES MATCHES "bf16") add_definitions(-DCK_ENABLE_FP16)
add_definitions(-D__bf16__) set(CK_ENABLE_FP16 "ON")
endif() endif()
message("DTYPES macro set to ${DTYPES}") if (DTYPES MATCHES "fp32")
add_definitions(-DCK_ENABLE_FP32)
set(CK_ENABLE_FP32 "ON")
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-DCK_ENABLE_FP64)
set(CK_ENABLE_FP64 "ON")
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-DCK_ENABLE_BF16)
set(CK_ENABLE_BF16 "ON")
endif()
message("DTYPES macro set to ${DTYPES}")
else() else()
add_definitions(-D__int8__ -D__fp8__ -D__fp16__ -D__fp32__ -D__fp64__ -D__bf16__) add_definitions(-DCK_ENABLE_INT8 -DCK_ENABLE_FP8 -DCK_ENABLE_BF8 -DCK_ENABLE_FP16 -DCK_ENABLE_FP32 -DCK_ENABLE_FP64 -DCK_ENABLE_BF16)
set(CK_ENABLE_ALL_DTYPES "ON")
add_compile_options(-Wno-bit-int-extension) # enable fp8 and bf8
endif() endif()
if(DL_KERNELS) if(DL_KERNELS)
add_definitions(-DDL_KERNELS) add_definitions(-DDL_KERNELS)
set(CK_ENABLE_DL_KERNELS "ON")
endif() endif()
if(INSTANCES_ONLY) if(INSTANCES_ONLY)
add_definitions(-DINSTANCES_ONLY) add_definitions(-DINSTANCES_ONLY)
set(CK_ENABLE_INSTANCES_ONLY "ON")
endif() endif()
# CK config file to record supported datatypes, etc.
configure_file("${PROJECT_SOURCE_DIR}/include/ck/config.h.in" "${PROJECT_BINARY_DIR}/include/ck/config.h")
# CK version file to record release version as well as git commit hash
find_package(Git REQUIRED)
execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD OUTPUT_VARIABLE COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE)
configure_file("${PROJECT_SOURCE_DIR}/include/ck/version.h.in" "${PROJECT_BINARY_DIR}/include/ck/version.h")
enable_testing() enable_testing()
set(ROCM_SYMLINK_LIBS OFF) set(ROCM_SYMLINK_LIBS OFF)
...@@ -50,8 +88,10 @@ include(ROCMInstallSymlinks) ...@@ -50,8 +88,10 @@ include(ROCMInstallSymlinks)
include(ROCMCreatePackage) include(ROCMCreatePackage)
include(CheckCXXCompilerFlag) include(CheckCXXCompilerFlag)
include(ROCMCheckTargetIds) include(ROCMCheckTargetIds)
rocm_setup_version(VERSION 0.2.0)
include(TargetFlags) include(TargetFlags)
rocm_setup_version(VERSION ${version})
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip) list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
message("GPU_TARGETS= ${GPU_TARGETS}") message("GPU_TARGETS= ${GPU_TARGETS}")
...@@ -65,6 +105,7 @@ if(NOT PROFILER_ONLY) ...@@ -65,6 +105,7 @@ if(NOT PROFILER_ONLY)
TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102") TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
else() else()
add_definitions(-DPROFILER_ONLY) add_definitions(-DPROFILER_ONLY)
set(GPU_TARGETS "" CACHE STRING "" FORCE)
if(GPU_TARGETS) if(GPU_TARGETS)
message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx9, gfx10, or gfx11") message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx9, gfx10, or gfx11")
endif() endif()
...@@ -77,11 +118,12 @@ else() ...@@ -77,11 +118,12 @@ else()
else() else()
message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx9, gfx10, or gfx11") message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx9, gfx10, or gfx11")
endif() endif()
set(GPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE)
endif() endif()
message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}") message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}")
set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " ") set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE)
if(GPU_TARGETS) if(GPU_TARGETS)
message("Building CK for the following targets: ${GPU_TARGETS}") message("Building CK for the following targets: ${GPU_TARGETS}")
...@@ -315,13 +357,14 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) ...@@ -315,13 +357,14 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
# set CK project include directories
include_directories(BEFORE include_directories(BEFORE
${PROJECT_BINARY_DIR}/include
${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/library/include ${PROJECT_SOURCE_DIR}/library/include
${HIP_INCLUDE_DIRS} ${HIP_INCLUDE_DIRS}
) )
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV) if(BUILD_DEV)
add_compile_options(-Werror) add_compile_options(-Werror)
...@@ -341,35 +384,39 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ...@@ -341,35 +384,39 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance) file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
set(add_inst 0) set(add_inst 0)
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8") if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8")
#message("fp8 instance found!") #message("fp8 instance found!")
set(add_inst 1) set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf8\" " AND DTYPES MATCHES "bf8")
#message("bf8 instance found!")
set(add_inst 1)
endif() endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16") if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16")
#message("fp16 instance found!") #message("fp16 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32") if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32")
#message("fp32 instance found!") #message("fp32 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64") if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64")
#message("fp64 instance found!") #message("fp64 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16") if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16")
#message("bf16 instance found!") #message("bf16 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8") if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8")
#message("int8 instance found!") #message("int8 instance found!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(NOT "${cmake_instance}" MATCHES "DTYPES") if(NOT "${cmake_instance}" MATCHES "DTYPES")
#message("instance should be built for all types!") #message("instance should be built for all types!")
set(add_inst 1) set(add_inst 1)
endif() endif()
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES) if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance) list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
endif() endif()
ENDIF() ENDIF()
ENDFOREACH() ENDFOREACH()
...@@ -409,7 +456,6 @@ endif() ...@@ -409,7 +456,6 @@ endif()
#Create an interface target for the include only files and call it "composablekernels" #Create an interface target for the include only files and call it "composablekernels"
include(CMakePackageConfigHelpers) include(CMakePackageConfigHelpers)
set(version 1.0.0)
write_basic_package_version_file( write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake" "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
VERSION "${version}" VERSION "${version}"
...@@ -417,9 +463,9 @@ write_basic_package_version_file( ...@@ -417,9 +463,9 @@ write_basic_package_version_file(
) )
configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
NO_CHECK_REQUIRED_COMPONENTS_MACRO NO_CHECK_REQUIRED_COMPONENTS_MACRO
) )
rocm_install(FILES rocm_install(FILES
...@@ -428,6 +474,13 @@ rocm_install(FILES ...@@ -428,6 +474,13 @@ rocm_install(FILES
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
) )
# Install CK version and configuration files
rocm_install(FILES
${PROJECT_BINARY_DIR}/include/ck/version.h
${PROJECT_BINARY_DIR}/include/ck/config.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/
)
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT") set(CPACK_RPM_PACKAGE_LICENSE "MIT")
......
...@@ -16,25 +16,12 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl ...@@ -16,25 +16,12 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
RUN wget https://repo.radeon.com/amdgpu-install/5.6/ubuntu/focal/amdgpu-install_5.6.50600-1_all.deb --no-check-certificate RUN wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb --no-check-certificate
RUN apt-get update && \ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_5.7.50700-1_all.deb
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
./amdgpu-install_5.6.50600-1_all.deb RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
RUN if [ "$ROCMVERSION" != "5.7" ]; then \ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "" ] || [ "$compiler_version" = "amd-stg-open" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
amdgpu-repo --amdgpu-build=1609671 --rocm-build=compute-rocm-npi-mi300/1354; \
elif [ "$ROCMVERSION" = "5.7" ] && [ "$compiler_version" = "rc1" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.7-20.04-1_all.deb" && \
apt update && apt-get install -y ./amdgpu-install-internal_5.7-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.7 rel-19 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=1637781; \
fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
RUN amdgpu-install -y --usecase=rocm --no-dkms RUN amdgpu-install -y --usecase=rocm --no-dkms
...@@ -63,7 +50,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -63,7 +50,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
nano \ nano \
zlib1g-dev \ zlib1g-dev \
openssh-server \ openssh-server \
clang-format-10 \ clang-format-12 \
kmod && \ kmod && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
...@@ -73,15 +60,8 @@ RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releas ...@@ -73,15 +60,8 @@ RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releas
RUN gunzip /usr/local/bin/ninja.gz RUN gunzip /usr/local/bin/ninja.gz
RUN chmod a+x /usr/local/bin/ninja RUN chmod a+x /usr/local/bin/ninja
RUN git clone https://github.com/nico/ninjatracing.git RUN git clone https://github.com/nico/ninjatracing.git
RUN apt purge --auto-remove -y cmake # Update the cmake to the latest version
RUN apt update RUN pip install --upgrade cmake
RUN apt install -y software-properties-common lsb-release
RUN apt clean all
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
RUN apt install -y kitware-archive-keyring
RUN rm /etc/apt/trusted.gpg.d/kitware.gpg
RUN apt install -y cmake
# Setup ubsan environment to printstacktrace # Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
......
...@@ -33,7 +33,7 @@ def runShell(String command){ ...@@ -33,7 +33,7 @@ def runShell(String command){
def getDockerImageName(){ def getDockerImageName(){
def img def img
if (params.ROCMVERSION != "5.7"){ if (params.ROCMVERSION != "6.0"){
if (params.COMPILER_VERSION == "") { if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}" img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
} }
...@@ -210,6 +210,9 @@ def cmake_build(Map conf=[:]){ ...@@ -210,6 +210,9 @@ def cmake_build(Map conf=[:]){
} else{ } else{
setup_args = ' -DBUILD_DEV=On' + setup_args setup_args = ' -DBUILD_DEV=On' + setup_args
} }
if (params.DL_KERNELS){
setup_args = setup_args + " -DDL_KERNELS=ON "
}
if(build_type_debug){ if(build_type_debug){
setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args
...@@ -367,8 +370,6 @@ def runCKProfiler(Map conf=[:]){ ...@@ -367,8 +370,6 @@ def runCKProfiler(Map conf=[:]){
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
timeout(time: 24, unit: 'HOURS') timeout(time: 24, unit: 'HOURS')
{ {
//cmake_build(conf)
//instead of building, just unstash the ckProfiler and install it
sh """ sh """
rm -rf build rm -rf build
mkdir build mkdir build
...@@ -612,9 +613,9 @@ def process_results(Map conf=[:]){ ...@@ -612,9 +613,9 @@ def process_results(Map conf=[:]){
} }
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=rc1 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=5.7;COMPILER_VERSION=
0 21 * * * % ROCMVERSION=5.6;COMPILER_VERSION=;COMPILER_COMMIT= 0 21 * * * % ROCMVERSION=5.7;COMPILER_VERSION=;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : "" 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
pipeline { pipeline {
agent none agent none
...@@ -631,8 +632,8 @@ pipeline { ...@@ -631,8 +632,8 @@ pipeline {
description: "Force building docker image (default: false), set to true if docker image needs to be updated.") description: "Force building docker image (default: false), set to true if docker image needs to be updated.")
string( string(
name: 'ROCMVERSION', name: 'ROCMVERSION',
defaultValue: '5.6', defaultValue: '5.7',
description: 'Specify which ROCM version to use: 5.6 (default).') description: 'Specify which ROCM version to use: 5.7 (default).')
string( string(
name: 'COMPILER_VERSION', name: 'COMPILER_VERSION',
defaultValue: '', defaultValue: '',
...@@ -649,6 +650,10 @@ pipeline { ...@@ -649,6 +650,10 @@ pipeline {
name: "RUN_FULL_QA", name: "RUN_FULL_QA",
defaultValue: false, defaultValue: false,
description: "Select whether to run small set of performance tests (default) or full QA") description: "Select whether to run small set of performance tests (default) or full QA")
booleanParam(
name: "DL_KERNELS",
defaultValue: false,
description: "Select whether to build DL kernels (default: OFF)")
} }
environment{ environment{
dbuser = "${dbuser}" dbuser = "${dbuser}"
...@@ -663,15 +668,12 @@ pipeline { ...@@ -663,15 +668,12 @@ pipeline {
} }
stages{ stages{
stage("Build Docker"){ stage("Build Docker"){
//when {
// beforeAgent true
// expression { params.BUILD_DOCKER.toBoolean() }
//}
parallel{ parallel{
stage('Docker /opt/rocm'){ stage('Docker /opt/rocm'){
agent{ label rocmnode("nogpu") } agent{ label rocmnode("nogpu") }
steps{ steps{
buildDocker('/opt/rocm') buildDocker('/opt/rocm')
cleanWs()
} }
} }
} }
...@@ -689,10 +691,11 @@ pipeline { ...@@ -689,10 +691,11 @@ pipeline {
-o -iname \'*.cpp.in\' \ -o -iname \'*.cpp.in\' \
-o -iname \'*.cl\' \ -o -iname \'*.cl\' \
| grep -v 'build/' \ | grep -v 'build/' \
| xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-10 -style=file {} | diff - {}\'" | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\'"
} }
steps{ steps{
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true) buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true)
cleanWs()
} }
} }
} }
...@@ -715,6 +718,7 @@ pipeline { ...@@ -715,6 +718,7 @@ pipeline {
} }
steps{ steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
} }
} }
stage("Build CK and run Tests on MI100/MI200") stage("Build CK and run Tests on MI100/MI200")
...@@ -730,6 +734,7 @@ pipeline { ...@@ -730,6 +734,7 @@ pipeline {
} }
steps{ steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
} }
} }
stage("Build CK and run Tests on Navi21") stage("Build CK and run Tests on Navi21")
...@@ -742,10 +747,10 @@ pipeline { ...@@ -742,10 +747,10 @@ pipeline {
environment{ environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
} }
steps{ steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
} }
} }
stage("Build CK and run Tests on Navi32") stage("Build CK and run Tests on Navi32")
...@@ -756,12 +761,12 @@ pipeline { ...@@ -756,12 +761,12 @@ pipeline {
} }
agent{ label rocmnode("navi32") } agent{ label rocmnode("navi32") }
environment{ environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DDTYPES="fp16;fp32;bf16" -DGPU_TARGETS="gfx1101" """ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDTYPES="fp16;fp32;bf16" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
} }
steps{ steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
cleanWs()
} }
} }
} }
...@@ -784,6 +789,7 @@ pipeline { ...@@ -784,6 +789,7 @@ pipeline {
} }
steps{ steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
cleanWs()
} }
} }
stage("Run ckProfiler: gfx90a") stage("Run ckProfiler: gfx90a")
...@@ -799,6 +805,7 @@ pipeline { ...@@ -799,6 +805,7 @@ pipeline {
} }
steps{ steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
cleanWs()
} }
} }
} }
...@@ -811,6 +818,7 @@ pipeline { ...@@ -811,6 +818,7 @@ pipeline {
agent { label 'mici' } agent { label 'mici' }
steps{ steps{
process_results() process_results()
cleanWs()
} }
} }
} }
......
...@@ -100,6 +100,10 @@ int main(int argc, char* argv[]) ...@@ -100,6 +100,10 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N + std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
...@@ -153,6 +157,10 @@ int main(int argc, char* argv[]) ...@@ -153,6 +157,10 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
} }
......
...@@ -191,6 +191,12 @@ int main(int argc, char* argv[]) ...@@ -191,6 +191,12 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
} }
......
...@@ -187,6 +187,12 @@ int main(int argc, char* argv[]) ...@@ -187,6 +187,12 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
} }
......
...@@ -129,6 +129,10 @@ int main(int argc, char* argv[]) ...@@ -129,6 +129,10 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_byte = std::size_t num_byte =
...@@ -184,6 +188,10 @@ int main(int argc, char* argv[]) ...@@ -184,6 +188,10 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
} }
......
add_executable(client_max_pool2d_fwd max_pool2d_fwd.cpp) add_executable(client_max_pool2d_fwd max_pool2d_fwd.cpp)
target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_operations) target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_operations)
add_executable(client_max_pool2d_bwd max_pool2d_bwd.cpp)
target_link_libraries(client_max_pool2d_bwd PRIVATE composable_kernel::device_operations)
add_executable(client_avg_pool3d_fwd avg_pool3d_fwd.cpp) add_executable(client_avg_pool3d_fwd avg_pool3d_fwd.cpp)
target_link_libraries(client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations) target_link_libraries(client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations)
\ No newline at end of file
add_executable(client_avg_pool3d_bwd avg_pool3d_bwd.cpp)
target_link_libraries(client_avg_pool3d_bwd PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp"
using DOutDataType = ck::half_t;
using DInDataType = ck::half_t;
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}, mMemSize_(mem_size)
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
void SetZero() const { (void)hipMemset(p_mem_, 0, mMemSize_); }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
std::size_t mMemSize_;
};
int main(int argc, char* argv[])
{
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Z = 2;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Di = 30;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_d = 2;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_d = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_d = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_d = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
// Pool API only support the order of NCDHW
std::vector<ck::index_t> in_length = {N, C, Di, Hi, Wi};
std::vector<ck::index_t> out_length = {N, C, Do, Ho, Wo};
std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w};
std::vector<ck::index_t> window_dilations{
window_dilation_d, window_dilation_h, window_dilation_w};
std::vector<ck::index_t> input_left_pads = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
std::size_t in_tensor_size = N * C * Di * Hi * Wi;
std::size_t out_tensor_size = N * C * Do * Ho * Wo;
// tensor layout = NDHWC
std::vector<ck::index_t> in_tensor_stride = {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C};
std::vector<ck::index_t> out_tensor_stride = {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C};
SimpleDeviceMem dout_device_buf(sizeof(DOutDataType) * out_tensor_size);
SimpleDeviceMem din_device_buf(sizeof(DInDataType) * in_tensor_size);
using DeviceOp = ck::tensor_operation::device::
DeviceAvgPoolBwd<3, DOutDataType, DInDataType, DOutLayout, DInLayout>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
out_length,
in_length,
out_tensor_stride,
in_tensor_stride,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
din_device_buf.SetZero();
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes =
in_tensor_size * sizeof(DInDataType) + out_tensor_size * sizeof(DOutDataType);
float gb_per_sec = num_bytes / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
// run the best intance
if(found)
{
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
out_length,
in_length,
out_tensor_stride,
in_tensor_stride,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
din_device_buf.SetZero();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using DOutDataType = ck::half_t;
using DInDataType = ck::half_t;
using IndexDataType = int32_t;
// We use pool3d to implement pool2d in this example
using InLayout = ck::tensor_layout::convolution::NDHWC;
using OutLayout = ck::tensor_layout::convolution::NDHWC;
constexpr ck::index_t InOutRank = 5;
constexpr ck::index_t WindowRank = 3;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
void TransformPool2dparamToPool3d(std::vector<ck::index_t>& input_lengths,
std::vector<ck::index_t>& window_lengths,
std::vector<ck::index_t>& output_lengths,
std::vector<ck::index_t>& input_stride,
std::vector<ck::index_t>& output_stride,
std::vector<ck::index_t>& indices_stride,
std::vector<ck::index_t>& window_strides,
std::vector<ck::index_t>& window_dilations,
std::vector<ck::index_t>& input_left_pads,
std::vector<ck::index_t>& input_right_pads,
std::vector<ck::index_t>& pooling_dims)
{
// NCHW to NCDHW
input_lengths.insert(input_lengths.begin() + 2, 1);
output_lengths.insert(output_lengths.begin() + 2, 1);
input_stride.insert(input_stride.begin() + 2, 0);
output_stride.insert(output_stride.begin() + 2, 0);
indices_stride.insert(indices_stride.begin() + 2, 0);
// YX to ZYX
window_lengths.insert(window_lengths.begin(), 1);
window_strides.insert(window_strides.begin(), 0);
window_dilations.insert(window_dilations.begin(), 0);
input_left_pads.insert(input_left_pads.begin(), 0);
input_right_pads.insert(input_right_pads.begin(), 0);
pooling_dims = {2, 3, 4};
}
int main(int argc, char* argv[])
{
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
// Pool API only support the order of NCHW
std::vector<ck::index_t> in_length = {N, C, Hi, Wi};
std::vector<ck::index_t> out_length = {N, C, Ho, Wo};
std::vector<ck::index_t> window_spatial_lengths = {Y, X};
std::vector<ck::index_t> window_strides = {window_stride_h, window_stride_w};
std::vector<ck::index_t> window_dilations = {window_dilation_h, window_dilation_w};
std::vector<ck::index_t> input_left_pads = {in_left_pad_h, in_left_pad_w};
std::vector<ck::index_t> input_right_pads = {in_right_pad_h, in_right_pad_w};
std::vector<ck::index_t> pooling_dims = {2, 3};
std::size_t in_tensor_size = N * C * Hi * Wi;
std::size_t out_tensor_size = N * C * Ho * Wo;
// tensor layout = NHWC
std::vector<ck::index_t> in_tensor_stride = {C * Hi * Wi, 1, Wi * C, C};
std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
TransformPool2dparamToPool3d(in_length,
window_spatial_lengths,
out_length,
in_tensor_stride,
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
pooling_dims);
SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
SimpleDeviceMem indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
SimpleDeviceMem dout_device_buf(sizeof(DOutDataType) * out_tensor_size);
SimpleDeviceMem din_device_buf(sizeof(DInDataType) * in_tensor_size);
// Generate index data from max pool forward
{
using MaxPoolFwdDeviceOp =
ck::tensor_operation::device::DevicePoolFwd<InOutRank,
WindowRank,
InDataType,
OutDataType,
IndexDataType,
InLayout,
OutLayout,
ck::ReduceTensorOp::MAX,
true>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
MaxPoolFwdDeviceOp>::GetInstances();
auto& op_ptr = op_ptrs[0];
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
in_length,
window_spatial_lengths,
out_length,
in_tensor_stride,
out_tensor_stride,
out_tensor_stride,
window_strides,
window_dilations,
input_left_pads,
input_right_pads,
pooling_dims);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
}
// Run MaxPool bwd
using MaxPoolBwdDeviceOp =
ck::tensor_operation::device::DeviceMaxPoolBwd<DOutDataType, IndexDataType, DInDataType>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
MaxPoolBwdDeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<InDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
out_tensor_size,
in_tensor_size,
window_spatial_lengths,
window_strides,
window_dilations);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes = in_tensor_size * sizeof(DInDataType) +
out_tensor_size * sizeof(IndexDataType) +
out_tensor_size * sizeof(DOutDataType);
float gb_per_sec = num_bytes / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << "GB / s,"
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
// run the best intance
if(found)
{
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<InDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
out_tensor_size,
in_tensor_size,
window_spatial_lengths,
window_strides,
window_dilations);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
add_executable(client_image_to_column image_to_column.cpp)
target_link_libraries(client_image_to_column PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using InLayout = ck::tensor_layout::convolution::GNHWC;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1;
static constexpr ck::index_t N = 32; // batch size
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 28; // input H
static constexpr ck::index_t Wi = 28; // input W
static constexpr ck::index_t Ho = 28; // output H
static constexpr ck::index_t Wo = 28; // output W
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main()
{
std::array<ck::index_t, 2> in_spatial_lengths{Hi, Wi};
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
// We have NHWGC in memory space (G is dummy)
// However, CK's API only accept length and stride with order of GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 2> out_strides{Y * X * C, 1};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * Y * X * C);
using DeviceOp = ck::tensor_operation::device::
DeviceImageToColumn<NumDimSpatial, InLayout, InDataType, OutDataType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
N,
C,
in_spatial_lengths,
out_spatial_lengths,
wei_spatial_lengths,
in_strides,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(avg_time < best_avg_time)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_gb_per_sec
<< " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
N,
C,
in_spatial_lengths,
out_spatial_lengths,
wei_spatial_lengths,
in_strides,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
}
if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
add_executable(client_splitK_gemm splitK_gemm_fp16_f8.cpp)
target_link_libraries(client_splitK_gemm PRIVATE composable_kernel::device_operations)
endif()
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
using F8 = ck::f8_t;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
using ADataType = F8;
using BDataType = F16;
using CDataType = F16;
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
ck::index_t KBatch = 1;
if(argc == 1)
{
// use default case
}
else if(argc == 8)
{
M = std::stoi(argv[1]);
N = std::stoi(argv[2]);
K = std::stoi(argv[3]);
StrideA = std::stoi(argv[4]);
StrideB = std::stoi(argv[5]);
StrideC = std::stoi(argv[6]);
KBatch = std::stoi(argv[7]);
}
else
{
printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideC, KBatch\n");
exit(0);
}
auto f_matrix_space_size =
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
using Layout = decltype(layout);
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
{
return (nRow - 1) * stride + nCol;
}
else
{
return (nCol - 1) * stride + nRow;
}
};
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<
ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
c_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
KBatch);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
c_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
KBatch);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
add_executable(client_grouped_gemm_fixed_nk_bias_fp16 grouped_gemm_fixed_nk_bias_fp16.cpp)
target_link_libraries(client_grouped_gemm_fixed_nk_bias_fp16 PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <iostream>
#include <vector>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_bias.hpp"
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
using ADataType = F16;
using BDataType = F16;
using D0DataType = F32;
using DsDataType = ck::Tuple<D0DataType>;
using EDataType = F32;
using ALayout = Row;
using BLayout = Row;
using D0Layout = Row;
using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Add;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main()
{
std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideEs;
int sum_of_m = 0;
Ms = {167, 183, 177, 181, 153, 139, 156, 173, 163, 150, 204, 184, 168, 156, 168, 148};
int group_count = Ms.size();
for(int i = 0; i < group_count; ++i)
{
Ns.push_back(768);
Ks.push_back(4608);
StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
StrideEs.push_back(std::is_same<Row, ELayout>::value ? Ns[i] : Ms[i]);
sum_of_m += Ms[i];
}
auto f_matrix_space_size =
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
using Layout = decltype(layout);
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
{
return (nRow - 1) * stride + nCol;
}
else
{
return (nCol - 1) * stride + nRow;
}
};
std::vector<SimpleDeviceMem> a_dev_bufs, b_dev_bufs, d0_dev_bufs, e_dev_bufs;
a_dev_bufs.reserve(group_count);
b_dev_bufs.reserve(group_count);
d0_dev_bufs.reserve(group_count);
e_dev_bufs.reserve(group_count);
std::vector<void*> p_e;
p_e.reserve(group_count);
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
gemm_descs.reserve(group_count);
std::vector<ck::tensor_operation::device::GroupedGemmKernelArgument<1>>
grouped_gemm_kernel_args_;
grouped_gemm_kernel_args_.reserve(group_count);
for(int i = 0; i < group_count; ++i)
{
a_dev_bufs.emplace_back(sizeof(ADataType) *
f_matrix_space_size(Ms[i], Ks[i], StrideAs[i], ALayout{}));
b_dev_bufs.emplace_back(sizeof(BDataType) *
f_matrix_space_size(Ks[i], Ns[i], StrideBs[i], BLayout{}));
d0_dev_bufs.emplace_back(sizeof(D0DataType) *
f_matrix_space_size(Ms[i], Ns[i], 0, D0Layout{}));
e_dev_bufs.emplace_back(sizeof(EDataType) *
f_matrix_space_size(Ms[i], Ns[i], StrideEs[i], ELayout{}));
gemm_descs.push_back({sum_of_m, Ns[i], Ks[i], 1, StrideBs[i], 1, {0}});
p_e.push_back(e_dev_bufs[i].GetDeviceBuffer());
grouped_gemm_kernel_args_.push_back(
{a_dev_bufs[i].GetDeviceBuffer(),
b_dev_bufs[i].GetDeviceBuffer(),
std::array<const void*, 1>{d0_dev_bufs[i].GetDeviceBuffer()},
e_dev_bufs[i].GetDeviceBuffer(),
Ms[i],
Ns[i],
Ks[i],
StrideAs[i],
StrideBs[i],
std::array<ck::index_t, 1>{0},
StrideEs[i]});
}
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemmFixedNK<ALayout,
BLayout,
DsLayout,
ELayout,
ADataType,
BDataType,
DsDataType,
EDataType,
AElementOp,
BElementOp,
CDEElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
std::vector<const void*> p_a = {}, p_b = {};
std::vector<std::array<const void*, 1>> p_ds = {};
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
SimpleDeviceMem grouped_gemm_kernel_args_dev(
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()));
SimpleDeviceMem grouped_gemm_workspace_dev(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
std::string op_name = op_ptr->GetTypeString();
hipGetErrorString(hipMemcpy(grouped_gemm_kernel_args_dev.GetDeviceBuffer(),
grouped_gemm_kernel_args_.data(),
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()),
hipMemcpyHostToDevice));
op_ptr->SetWorkSpacePointer(argument_ptr.get(),
grouped_gemm_workspace_dev.GetDeviceBuffer());
op_ptr->SetDeviceKernelArgs(argument_ptr.get(),
grouped_gemm_kernel_args_dev.GetDeviceBuffer());
op_ptr->SetKBatch(argument_ptr.get(), 2);
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = 0, num_btype = 0;
for(std::size_t j = 0; j < gemm_descs.size(); ++j)
{
flop += std::size_t(2) * Ms[j] * Ns[j] * Ks[j];
num_btype += sizeof(ADataType) * Ms[j] * Ks[j] + sizeof(BDataType) * Ks[j] * Ns[j] +
sizeof(EDataType) * Ms[j] * Ns[j];
}
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment