diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5667695bb5500d44aac0a9aa3ad24e0f9b458c32 --- /dev/null +++ b/.gitignore @@ -0,0 +1,53 @@ +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch +*.ipch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# vim tags +tags +.tags +.*.swp + +# Editors +.vscode + +# build-in-source directory +build* + +# emacs temporary/backup files +.\#* +\#*\# +*~ + +# GDB temporary files +.gdb_history +install.dir* + +# directories containing generated documentation +docs/source/_build/ +docs/docBin/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..79c45a0db338cb645929380f4b275e6caa3aa1b9 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,24 @@ +# Change Log for Composable Kernel + +Full documentation for Composable Kernel is not yet available. + +## CK 0.1.1 for ROCm 5.5.0 + +### Fixed +- Fixed a bug in 6-dimensional kernels (#555). +- Fixed grouped ConvBwdWeight test case failure (#524). + +### Optimizations +- Improve proformance of normalization kernel + +### Added +- Added user tutorial (#563). +- Added more instances for irregular GEMM sizes (#560). +- Added inter-wave consumer-producer programming model for GEMM kernels (#310). +- Added multi-D GEMM client APIs (#534). +- Added multi-embeddings support (#542). +- Added Navi3x blockwise GEMM and real GEMM support (#541). +- Added Navi grouped ConvBwdWeight support (#505). + +### Changed +- Changed ... diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..d35fe9e5870f0a538c1775efd239dc729e11ab88 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,67 @@ +cff-version: 1.2.0 +title: Composable Kernel +message: If you use this software, please cite using the following metadata. +type: software +authors: + - given-names: Chao + family-names: Liu + email: chao.liu2@amd.com + affiliation: AMD + - given-names: Jing + family-names: Zhang + email: jing.zhang3@amd.com + affiliation: AMD + - given-names: Letao + family-names: Qin + email: letao.qin@amd.com + affiliation: AMD + - given-names: Qianfeng + family-names: Zhang + email: qianfeng.zhang@amd.com + affiliation: AMD + - given-names: Liang + family-names: Huang + email: carlus.huang@amd.com + affiliation: AMD + - given-names: Shaojie + family-names: Wang + email: shaojie.wang@amd.com + affiliation: AMD + - given-names: Anthony + family-names: Chang + email: antc@amd.com + affiliation: AMD + - given-names: Chunyu + family-names: Lai + email: chunyu.lai@amd.com + affiliation: AMD + - given-names: Illia + family-names: Silin + email: illia.silin@amd.com + affiliation: AMD + - given-names: Adam + family-names: Osewski + email: adam.osewski@amd.com + affiliation: AMD + - given-names: Poyen + family-names: Chen + email: poyen.chen@amd.com + affiliation: AMD + - given-names: Rosty + family-names: Geyyer + email: rosty.geyyer@amd.com + affiliation: AMD + - given-names: Hanwen + family-names: Chen + - given-names: Tejash + family-names: Shah + - given-names: Xiaoyan + family-names: Zhou + - given-names: Jianfeng + family-names: Yan +repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel' +abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel progarmming languages, like HIP C++. +keywords: + - 'CK, Composable Kernel, Tensor Coordinate Transformation' +license: MIT +license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE diff --git a/CMakeLists.txt b/CMakeLists.txt index 306e6ca6491cbf8d6edca64dd887722107d59a58..f861e302039de8e2c1ae17bf7cb39361aff8535f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,39 @@ -cmake_minimum_required(VERSION 3.5) +cmake_minimum_required(VERSION 3.14) + +# Check support for CUDA/HIP in Cmake project(composable_kernel) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") +enable_testing() + +set(ROCM_SYMLINK_LIBS OFF) +find_package(ROCM REQUIRED PATHS /opt/rocm) + +include(ROCMInstallTargets) +include(ROCMPackageConfigHelpers) +include(ROCMSetupVersion) +include(ROCMInstallSymlinks) +include(ROCMCreatePackage) include(CheckCXXCompilerFlag) +rocm_setup_version(VERSION 0.2.0) +include(TargetFlags) +list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip) + +option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF) + +if(USE_BITINT_EXTENSION_INT4) + add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) + add_compile_options(-Wno-bit-int-extension) + message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}") +endif() + +## Threads +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) +link_libraries(Threads::Threads) + ## C++ enable_language(CXX) set(CMAKE_CXX_STANDARD 17) @@ -30,35 +59,44 @@ message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}") message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}") message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}") -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") link_libraries(${OpenMP_gomp_LIBRARY}) link_libraries(${OpenMP_pthread_LIBRARY}) ## HIP find_package(HIP REQUIRED) -message(STATUS "Build with HIP ${hip_VERSION}") - -## half -#find_path(HALF_INCLUDE_DIR half.hpp) -message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}") - -# CMAKE_CXX_FLAGS -SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") -if(BUILD_DEV) - string(APPEND CMAKE_CXX_FLAGS " -Werror -Weverything") +# Override HIP version in config.h, if necessary. +# The variables set by find_package() can't be overwritten, +# therefore let's use intermediate variables. +set(CK_HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}") +set(CK_HIP_VERSION_MINOR "${HIP_VERSION_MINOR}") +set(CK_HIP_VERSION_PATCH "${HIP_VERSION_PATCH}") +if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR ) + set(CK_HIP_VERSION_MAJOR "${CK_OVERRIDE_HIP_VERSION_MAJOR}") + message(STATUS "CK_HIP_VERSION_MAJOR overriden with ${CK_OVERRIDE_HIP_VERSION_MAJOR}") endif() -message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") +if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR ) + set(CK_HIP_VERSION_MINOR "${CK_OVERRIDE_HIP_VERSION_MINOR}") + message(STATUS "CK_HIP_VERSION_MINOR overriden with ${CK_OVERRIDE_HIP_VERSION_MINOR}") +endif() +if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH ) + set(CK_HIP_VERSION_PATCH "${CK_OVERRIDE_HIP_VERSION_PATCH}") + message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}") +endif() +message(STATUS "Build with HIP ${HIP_VERSION}") +link_libraries(hip::device) +add_compile_definitions(__HIP_PLATFORM_HCC__=1) ## tidy include(EnableCompilerWarnings) -set(MIOPEN_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name) +set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name) if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+") - set(MIOPEN_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter) + set(CK_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter) # Enable tidy on hip -elseif(MIOPEN_BACKEND STREQUAL "HIP" OR MIOPEN_BACKEND STREQUAL "HIPNOGPU") - set(MIOPEN_TIDY_ERRORS ALL) +elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU") + set(CK_TIDY_ERRORS ALL) endif() + include(ClangTidy) enable_clang_tidy( CHECKS @@ -150,13 +188,12 @@ enable_clang_tidy( -cppcoreguidelines-narrowing-conversions -altera-struct-pack-align -cppcoreguidelines-prefer-member-initializer - - ${MIOPEN_TIDY_CHECKS} - ${MIOPEN_TIDY_ERRORS} + ${CK_TIDY_CHECKS} + ${CK_TIDY_ERRORS} HEADER_FILTER "\.hpp$" EXTRA_ARGS - -DMIOPEN_USE_CLANG_TIDY + -DCK_USE_CLANG_TIDY ) include(CppCheck) @@ -180,19 +217,95 @@ enable_cppcheck( unmatchedSuppression FORCE SOURCES - host/host_tensor/src - host/driver_offline/src - composable_kernel/src/kernel_wrapper + library/src INCLUDE - host/host_tensor/include - host/solver/include - host/driver_offline/include - composable_kernel/include/* ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_BINARY_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/library/include DEFINE CPPCHECK=1 __linux__=1 ) -add_subdirectory(host) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) + +include_directories(BEFORE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${HIP_INCLUDE_DIRS} +) + + +SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") +if(BUILD_DEV) + add_compile_options(-Werror) + add_compile_options(-Weverything) +endif() +message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") + +add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR}) + +file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp") +file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*) +set(CK_DEVICE_INSTANCES) +FOREACH(subdir_path ${dir_list}) + IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}") + list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance) + ENDIF() +ENDFOREACH() +add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES}) + +rocm_package_setup_component(tests + LIBRARY_NAME composablekernel + PACKAGE_NAME tests # Prevent -static suffix on package name +) + +rocm_package_setup_component(examples + LIBRARY_NAME composablekernel + PACKAGE_NAME examples +) + +rocm_package_setup_component(profiler + LIBRARY_NAME composablekernel + PACKAGE_NAME ckProfiler +) + +add_subdirectory(library) +add_subdirectory(example) +add_subdirectory(test) +add_subdirectory(profiler) + +#Create an interface target for the include only files and call it "composablekernels" +include(CMakePackageConfigHelpers) + +set(version 1.0.0) +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake" + VERSION "${version}" + COMPATIBILITY AnyNewerVersion +) + +configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel + NO_CHECK_REQUIRED_COMPONENTS_MACRO +) + +rocm_install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel +) + +set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") +set(CPACK_RPM_PACKAGE_LICENSE "MIT") + +rocm_create_package( + NAME composablekernel + DESCRIPTION "High Performance Composable Kernel for AMD GPUs" + MAINTAINER "MIOpen Kernels Dev Team " + LDCONFIG + HEADER_ONLY +) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 0000000000000000000000000000000000000000..8ccfe99c3cc73b643f8b92cb654005e54c0774bd --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,31 @@ +# Composable Kernel Developers and Contributors + +This is the list of developers and contributors to Composable Kernel library + + +## Developers +[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2022 + +[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2022 + +[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), 2022 + +Hanwen Chang, 2019-2021, + +Tejash Shah, 2019-2020 + +Xiaoyan Zhou, 2020 + +[Jianfeng Yan](https://github.com/j4yan), 2021-2022 + + +## Product Manager +[Jun Liu](https://github.com/junliume) + + +## Contributors +[Dan Yao](https://github.com/danyao12), [Guangzhao Lu](https://github.com/guangzlu), [Raman Jana](https://github.com/ramjana), [Jehandad Khan](https://github.com/JehandadKhan), [Wen-Heng (Jack) Chung](https://github.com/whchung) + + +## Acknowledgement +CK team works closely with Meta [AITemplate](https://github.com/facebookincubator/AITemplate) team ([Bing Xu](https://github.com/antinucleon), [Hao Lu](https://github.com/hlu1), [Ying Zhang](https://github.com/ipiszy), etc). Most of the lucrative graph optimization opportunities in ML models were identified by AITemplate team, and we also co-designed many high performance fused kernels for AMD GPUs. Without this collaboration, CK would not reach its current potential. diff --git a/Config.cmake.in b/Config.cmake.in new file mode 100644 index 0000000000000000000000000000000000000000..02978cd4dd49f53188080f1a0404cf87bfd52d7b --- /dev/null +++ b/Config.cmake.in @@ -0,0 +1,11 @@ +@PACKAGE_INIT@ + +set(_composable_kernel_supported_components device_operations utility) + +foreach(_comp ${composable_kernel_FIND_COMPONENTS}) + if(NOT _comp IN_LIST _composable_kernel_supported_components) + set(composable_kernel_FOUND False) + set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}") + endif() + include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake") +endforeach() diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dd2a97c7bd82794b95776d6a817a2d6a5ae84270 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,110 @@ +FROM ubuntu:20.04 + +ARG ROCMVERSION=5.3 +ARG compiler_version="release" +ARG compiler_commit="" + +RUN set -xe + +ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ +# Add rocm repository +RUN apt-get update +RUN apt-get install -y wget gnupg +RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - +RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" +RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - +RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" + +# Install dependencies +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + build-essential \ + ccache \ + cmake-data \ + cmake \ + curl \ + git \ + hip-rocclr \ + jq \ + libelf-dev \ + libncurses5-dev \ + libnuma-dev \ + libpthread-stubs0-dev \ + llvm-amdgpu \ + pkg-config \ + python \ + python3 \ + python-dev \ + python3-dev \ + python3-pip \ + software-properties-common \ + rocm-dev \ + rocm-device-libs \ + rocm-cmake \ + vim \ + zlib1g-dev \ + openssh-server \ + clang-format-10 \ + kmod && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Setup ubsan environment to printstacktrace +RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer +ENV UBSAN_OPTIONS=print_stacktrace=1 + +# Install an init system +RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb +RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb + +ARG PREFIX=/opt/rocm +# Install packages for processing the performance results +RUN pip3 install --upgrade pip +RUN pip3 install sqlalchemy==1.4.46 +RUN pip3 install pymysql +RUN pip3 install pandas +RUN pip3 install setuptools-rust +RUN pip3 install sshtunnel +# Setup ubsan environment to printstacktrace +ENV UBSAN_OPTIONS=print_stacktrace=1 + +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 +RUN groupadd -f render + +# Install the new rocm-cmake version +RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git && \ + cd rocm-cmake && mkdir build && cd build && \ + cmake .. && cmake --build . && cmake --build . --target install + +WORKDIR / + +ENV compiler_version=$compiler_version +ENV compiler_commit=$compiler_commit +RUN sh -c "echo compiler version = '$compiler_version'" +RUN sh -c "echo compiler commit = '$compiler_commit'" + +RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ]; then \ + sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/hip/bin/hipcc.pl && \ + sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/bin/hipcc.pl; \ + fi + +RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" = "" ]; then \ + git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ + cd llvm-project && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ + make -j 8 ; \ + else echo "using the release compiler"; \ + fi + +RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" != "" ]; then \ + git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ + cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ + make -j 8 ; \ + else echo "using the release compiler"; \ + fi + + +#ENV HIP_CLANG_PATH='/llvm-project/build/bin' +#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'" diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000000000000000000000000000000000000..6b255ce13ceabeba991d2ce8bdfd74c9ee47d40a --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,719 @@ +def rocmnode(name) { + return 'rocmtest && miopen && ' + name +} + +def show_node_info() { + sh """ + echo "NODE_NAME = \$NODE_NAME" + lsb_release -sd + uname -r + ls /opt/ -la + """ +} + +def runShell(String command){ + def responseCode = sh returnStatus: true, script: "${command} > tmp.txt" + def output = readFile(file: "tmp.txt") + echo "tmp.txt contents: $output" + return (output != "") +} + +def getDockerImageName(){ + def img + if (params.COMPILER_COMMIT == ""){ + img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" + } + else{ + def commit = "${params.COMPILER_COMMIT}"[0..6] + img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}" + } + return img +} + +def check_host() { + if ("${env.CK_CCACHE}" != "null"){ + def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}" + echo "ccache server: ${CCACHE_SERVER}" + sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? > tmp.txt''' + def output = readFile(file: "tmp.txt") + echo "tmp.txt contents: \$output" + return (output != "0") + } + else{ + return 1 + } +} + +def build_compiler(){ + def compiler + if (params.BUILD_COMPILER == "hipcc"){ + compiler = '/opt/rocm/bin/hipcc' + } + else{ + if (params.COMPILER_VERSION == "release"){ + compiler = "/opt/rocm/llvm/bin/clang++" + } + else{ + compiler = "/llvm-project/build/bin/clang++" + } + } + return compiler +} + +def getDockerImage(Map conf=[:]){ + env.DOCKER_BUILDKIT=1 + def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm + def no_cache = conf.get("no_cache", false) + def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + echo "ccache server: ${env.CK_CCACHE}" + if(env.CK_CCACHE) + { + if(check_host()) + { + echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}" + } + else + { + echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response" + } + dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' " + env.CCACHE_DIR = """/tmp/ccache_store""" + env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}""" + } + if(no_cache) + { + dockerArgs = dockerArgs + " --no-cache " + } + echo "Docker Args: ${dockerArgs}" + def image = getDockerImageName() + //Check if image exists + def retimage + try + { + echo "Pulling down image: ${image}" + retimage = docker.image("${image}") + retimage.pull() + } + catch(Exception ex) + { + error "Unable to locate image: ${image}" + } + return [retimage, image] +} + +def buildDocker(install_prefix){ + show_node_info() + env.DOCKER_BUILDKIT=1 + checkout scm + def image_name = getDockerImageName() + echo "Building Docker for ${image_name}" + def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + echo "ccache server: ${env.CK_CCACHE}" + if(env.CK_CCACHE) + { + if(check_host()) + { + echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}" + } + else + { + echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response" + } + dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' " + env.CCACHE_DIR = """/tmp/ccache_store""" + env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}""" + } + + echo "Build Args: ${dockerArgs}" + try{ + if(params.BUILD_DOCKER){ + //force building the new docker if that parameter is true + echo "Building image: ${image_name}" + retimage = docker.build("${image_name}", dockerArgs + ' .') + retimage.push() + } + else{ + echo "Checking for image: ${image_name}" + sh "docker manifest inspect --insecure ${image_name}" + echo "Image: ${image_name} found!! Skipping building image" + } + } + catch(Exception ex){ + echo "Unable to locate image: ${image_name}. Building image now" + retimage = docker.build("${image_name}", dockerArgs + ' .') + retimage.push() + } +} + +def cmake_build(Map conf=[:]){ + + def compiler = build_compiler() + def config_targets = conf.get("config_targets","check") + def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "") + def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","") + def prefixpath = conf.get("prefixpath","/opt/rocm") + def setup_args = conf.get("setup_args","") + + if (prefixpath != "/usr/local"){ + setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} " + } + + def build_type_debug = (conf.get("build_type",'release') == 'debug') + + //cmake_env can overwrite default CXX variables. + def cmake_envs = "CXX=${compiler} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","") + + def package_build = (conf.get("package_build","") == "true") + + if (package_build == true) { + config_targets = "package" + } + + if(conf.get("build_install","") == "true") + { + config_targets = 'install ' + config_targets + setup_args = ' -DBUILD_DEV=On -DCMAKE_INSTALL_PREFIX=../install' + setup_args + } else{ + setup_args = ' -DBUILD_DEV=On' + setup_args + } + + if(build_type_debug){ + setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args + }else{ + setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args + } + if(env.CK_CCACHE) + { + setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args + } + echo "ccache server: ${env.CK_CCACHE}" + + def pre_setup_cmd = """ + echo \$HSA_ENABLE_SDMA + ulimit -c unlimited + rm -rf build + mkdir build + rm -rf install + mkdir install + cd build + """ + def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ") + // reduce parallelism when compiling, clang uses too much memory + def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}") + def execute_cmd = conf.get("execute_cmd", "") + + def cmd = conf.get("cmd", """ + ${pre_setup_cmd} + ${setup_cmd} + ${build_cmd} + ${execute_cmd} + """) + + echo cmd + sh cmd + + // Only archive from master or develop + if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "master")) { + archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true + } +} + +def buildHipClangJob(Map conf=[:]){ + show_node_info() + + env.HSA_ENABLE_SDMA=0 + checkout scm + + def image = getDockerImageName() + def prefixpath = conf.get("prefixpath", "/opt/rocm") + + // Jenkins is complaining about the render group + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1 " + } + def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + if (params.COMPILER_VERSION != "release"){ + dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " + } + + def variant = env.STAGE_NAME + + def retimage + (retimage, image) = getDockerImage(conf) + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 5, unit: 'HOURS') + { + cmake_build(conf) + } + } + } + return retimage +} + +def reboot(){ + build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),] +} + +def buildHipClangJobAndReboot(Map conf=[:]){ + try{ + buildHipClangJob(conf) + } + catch(e){ + echo "throwing error exception for the stage" + echo 'Exception occurred: ' + e.toString() + throw e + } + finally{ + if (!conf.get("no_reboot", false)) { + reboot() + } + } +} + +def runCKProfiler(Map conf=[:]){ + show_node_info() + + env.HSA_ENABLE_SDMA=0 + checkout scm + + def image = getDockerImageName() + def prefixpath = conf.get("prefixpath", "/opt/rocm") + + // Jenkins is complaining about the render group + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1 " + } + def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + if (params.COMPILER_VERSION != "release"){ + dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " + } + + def variant = env.STAGE_NAME + def retimage + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + try { + (retimage, image) = getDockerImage(conf) + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES'){ + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + throw new Exception ("GPU not found") + } + else{ + echo "GPU is OK" + } + } + } + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + catch(Exception ex) { + retimage = docker.build("${image}", dockerArgs + " --no-cache .") + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES'){ + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + throw new Exception ("GPU not found") + } + else{ + echo "GPU is OK" + } + } + } + } + + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 24, unit: 'HOURS') + { + //cmake_build(conf) + //instead of building, just unstash the ckProfiler and install it + sh """ + rm -rf build + mkdir build + """ + dir("build"){ + unstash 'ckProfiler.tar.gz' + sh 'tar -xvf ckProfiler.tar.gz' + } + + dir("script"){ + if (params.RUN_FULL_QA){ + sh "./run_full_performance_tests.sh 1 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + archiveArtifacts "perf_gemm.log" + archiveArtifacts "perf_resnet50_N256.log" + archiveArtifacts "perf_resnet50_N4.log" + archiveArtifacts "perf_batched_gemm.log" + archiveArtifacts "perf_grouped_gemm.log" + archiveArtifacts "perf_conv_fwd.log" + archiveArtifacts "perf_conv_bwd_data.log" + archiveArtifacts "perf_gemm_bilinear.log" + archiveArtifacts "perf_reduction.log" + archiveArtifacts "perf_splitK_gemm_verify.log" + archiveArtifacts "perf_splitK_gemm.log" + archiveArtifacts "perf_onnx_gemm.log" + // stash perf files to master + stash name: "perf_gemm.log" + stash name: "perf_resnet50_N256.log" + stash name: "perf_resnet50_N4.log" + stash name: "perf_batched_gemm.log" + stash name: "perf_grouped_gemm.log" + stash name: "perf_conv_fwd.log" + stash name: "perf_conv_bwd_data.log" + stash name: "perf_gemm_bilinear.log" + stash name: "perf_reduction.log" + stash name: "perf_splitK_gemm.log" + stash name: "perf_onnx_gemm.log" + //we will process results on the master node + } + else{ + sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + archiveArtifacts "perf_gemm.log" + archiveArtifacts "perf_resnet50_N256.log" + archiveArtifacts "perf_resnet50_N4.log" + // stash perf files to master + stash name: "perf_gemm.log" + stash name: "perf_resnet50_N256.log" + stash name: "perf_resnet50_N4.log" + //we will process the results on the master node + } + } + } + } + } + return retimage +} + +def runPerfTest(Map conf=[:]){ + try{ + runCKProfiler(conf) + } + catch(e){ + echo "throwing error exception in performance tests" + echo 'Exception occurred: ' + e.toString() + throw e + } + finally{ + if (!conf.get("no_reboot", false)) { + reboot() + } + } +} + +def Build_CK(Map conf=[:]){ + show_node_info() + + env.HSA_ENABLE_SDMA=0 + checkout scm + + def image = getDockerImageName() + def prefixpath = conf.get("prefixpath", "/opt/rocm") + + // Jenkins is complaining about the render group + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1 " + } + def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + if (params.COMPILER_VERSION != "release"){ + dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " + } + + def variant = env.STAGE_NAME + def retimage + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + try { + (retimage, image) = getDockerImage(conf) + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES'){ + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + throw new Exception ("GPU not found") + } + else{ + echo "GPU is OK" + } + } + } + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + catch(Exception ex) { + retimage = docker.build("${image}", dockerArgs + " --no-cache .") + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES'){ + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + throw new Exception ("GPU not found") + } + else{ + echo "GPU is OK" + } + } + } + } + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 24, unit: 'HOURS') + { + cmake_build(conf) + dir("build"){ + //run tests and examples + sh 'make -j check' + //we only need the ckProfiler to run the performance tests, so we pack and stash it + sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler' + stash "ckProfiler.tar.gz" + if (params.RUN_FULL_QA){ + // build deb packages + sh 'make -j package' + archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb' + archiveArtifacts artifacts: 'composablekernel-tests_*.deb' + } + } + } + } + } + return retimage +} + +def Build_CK_and_Reboot(Map conf=[:]){ + try{ + Build_CK(conf) + } + catch(e){ + echo "throwing error exception while building CK" + echo 'Exception occurred: ' + e.toString() + throw e + } + finally{ + if (!conf.get("no_reboot", false)) { + reboot() + } + } +} + +def process_results(Map conf=[:]){ + env.HSA_ENABLE_SDMA=0 + checkout scm + def image = getDockerImageName() + def prefixpath = "/opt/rocm" + + // Jenkins is complaining about the render group + def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1 " + } + + def variant = env.STAGE_NAME + def retimage + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + try { + (retimage, image) = getDockerImage(conf) + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + } + + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 1, unit: 'HOURS'){ + try{ + dir("script"){ + if (params.RUN_FULL_QA){ + // unstash perf files to master + unstash "perf_gemm.log" + unstash "perf_resnet50_N256.log" + unstash "perf_resnet50_N4.log" + unstash "perf_batched_gemm.log" + unstash "perf_grouped_gemm.log" + unstash "perf_conv_fwd.log" + unstash "perf_conv_bwd_data.log" + unstash "perf_gemm_bilinear.log" + unstash "perf_reduction.log" + unstash "perf_splitK_gemm.log" + unstash "perf_onnx_gemm.log" + sh "./process_qa_data.sh" + } + else{ + // unstash perf files to master + unstash "perf_gemm.log" + unstash "perf_resnet50_N256.log" + unstash "perf_resnet50_N4.log" + sh "./process_perf_data.sh" + } + } + } + catch(e){ + echo "throwing error exception while processing performance test results" + echo 'Exception occurred: ' + e.toString() + throw e + } + } + } +} + +//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true + 0 21 * * * % RUN_FULL_QA=false;COMPILER_VERSION=release;COMPILER_COMMIT= + 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : "" + +pipeline { + agent none + triggers { + parameterizedCron(CRON_SETTINGS) + } + options { + parallelsAlwaysFailFast() + } + parameters { + booleanParam( + name: "BUILD_DOCKER", + defaultValue: false, + description: "Force building docker image (default: false), set to true if docker image needs to be updated.") + string( + name: 'ROCMVERSION', + defaultValue: '5.4.3', + description: 'Specify which ROCM version to use: 5.4.3 (default).') + string( + name: 'COMPILER_VERSION', + defaultValue: 'amd-stg-open', + description: 'Specify which version of compiler to use: ck-9110, release, or amd-stg-open (default).') + string( + name: 'COMPILER_COMMIT', + defaultValue: '5541927df00eabd6a110180170eca7785d436ee3', + description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit, or use 5541927df00eabd6a110180170eca7785d436ee3 (default) commit of amd-stg-open branch.') + string( + name: 'BUILD_COMPILER', + defaultValue: 'hipcc', + description: 'Specify whether to build CK with hipcc (default) or with clang.') + booleanParam( + name: "RUN_FULL_QA", + defaultValue: false, + description: "Select whether to run small set of performance tests (default) or full QA") + } + environment{ + dbuser = "${dbuser}" + dbpassword = "${dbpassword}" + dbsship = "${dbsship}" + dbsshport = "${dbsshport}" + dbsshuser = "${dbsshuser}" + dbsshpassword = "${dbsshpassword}" + status_wrapper_creds = "${status_wrapper_creds}" + gerrit_cred="${gerrit_cred}" + DOCKER_BUILDKIT = "1" + } + stages{ + stage("Build Docker"){ + //when { + // beforeAgent true + // expression { params.BUILD_DOCKER.toBoolean() } + //} + parallel{ + stage('Docker /opt/rocm'){ + agent{ label rocmnode("nogpu") } + steps{ + buildDocker('/opt/rocm') + } + } + } + } + stage("Static checks") { + parallel{ + stage('Clang Format') { + agent{ label rocmnode("nogpu") } + environment{ + execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \ + -o -not -path \'*.git*\' -iname \'*.hpp\' \ + -o -not -path \'*.git*\' -iname \'*.cpp\' \ + -o -iname \'*.h.in\' \ + -o -iname \'*.hpp.in\' \ + -o -iname \'*.cpp.in\' \ + -o -iname \'*.cl\' \ + | grep -v 'build/' \ + | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-10 -style=file {} | diff - {}\'" + } + steps{ + buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true) + } + } + } + } + + stage("Build CK and run Tests") + { + parallel + { + stage("Build CK and run Tests") + { + agent{ label rocmnode("gfx908 || gfx90a") } + environment{ + setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx1030" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" """ : """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx1030" -DCMAKE_CXX_FLAGS="-O3 " """ }" + execute_args = "${params.COMPILER_VERSION == "ck-9110" ? """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx1030" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908,gfx90a;gfx1030" -DCMAKE_CXX_FLAGS="-O3" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ }" + } + steps{ + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + } + } + } + } + + stage("Performance Tests") + { + parallel + { + stage("Run ckProfiler: gfx908 or gfx90a") + { + when { + beforeAgent true + expression { !params.RUN_FULL_QA.toBoolean() } + } + options { retry(2) } + agent{ label rocmnode("gfx908 || gfx90a")} + environment{ + setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DGPU_TARGETS="gfx908;gfx90a;gfx1030" -DCMAKE_CXX_FLAGS=" -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """ : """ -DGPU_TARGETS="gfx908;gfx90a;gfx1030" -DCMAKE_CXX_FLAGS=" -O3 " -DBUILD_DEV=On """}" + } + steps{ + runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') + } + } + stage("Run ckProfiler: gfx90a") + { + when { + beforeAgent true + expression { params.RUN_FULL_QA.toBoolean() } + } + options { retry(2) } + agent{ label rocmnode("gfx90a")} + environment{ + setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """ : """ -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " -DBUILD_DEV=On """}" + } + steps{ + runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') + } + } + } + } + stage("Process Performance Test Results") + { + parallel + { + stage("Process results"){ + agent { label 'mici' } + steps{ + process_results() + } + } + } + } + } +} diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..2fe9a8455efaeda2eab474b2aa038ec2d9e76841 --- /dev/null +++ b/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2018- , Advanced Micro Devices, Inc. (Chao Liu, Jing Zhang) +Copyright (c) 2019- , Advanced Micro Devices, Inc. (Letao Qin, Qianfeng Zhang, Liang Huang, Shaojie Wang) +Copyright (c) 2022- , Advanced Micro Devices, Inc. (Anthony Chang, Chunyu Lai, Illia Silin, Adam Osewski, Poyen Chen, Jehandad Khan) +Copyright (c) 2019-2021, Advanced Micro Devices, Inc. (Hanwen Chang) +Copyright (c) 2019-2020, Advanced Micro Devices, Inc. (Tejash Shah) +Copyright (c) 2020 , Advanced Micro Devices, Inc. (Xiaoyan Zhou) +Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan) + +SPDX-License-Identifier: MIT +Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 4f071d5896c27aea55f00d31fe338ff94473b794..151da974a559bef2e52971034f607d4afe44f632 100644 --- a/README.md +++ b/README.md @@ -1,177 +1,96 @@ -# How to build and run +# Composable Kernel -# Docker -``` -docker run \ --it \ ---rm \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/tensorflow:rocm4.2-tf2.4-dev \ -/bin/bash -``` +## Methodology +Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++. -# Install Boost for online compilation -https://www.boost.org/doc/libs/1_66_0/more/getting_started/unix-variants.html#easy-build-and-install +CK utilizes two concepts to achieve performance portability and code maintainability: +* A tile-based programming model +* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation". +![ALT](/doc/image/ck_component.png "CK Components") -# Build -Add path of Boost -``` - export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH +## Code Structure +Current CK library are structured into 4 layers: +* "Templated Tile Operators" layer +* "Templated Kernel and Invoker" layer +* "Instantiated Kernel and Invoker" layer +* "Client API" layer + +![ALT](/doc/image/ck_layer.png "CK Layers") + +## Contributors +The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md) + +## Citation +If you use CK, please use following citations: +* CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???) +* [CITATION.cff](/CITATION.cff) + +## License +CK is released under the MIT license. [License File](/LICENSE) + + +# Build CK + +## Build docker image +```bash +DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile . ``` +## Launch docker +```bash +docker run \ +-it \ +--privileged \ +--group-add sudo \ +-w /root/workspace \ +-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ +ck:latest \ +/bin/bash ``` + +## Build CK +```bash mkdir build && cd build -``` -cmake cmd. Need to Specify target ID, example below is gfx908 -``` -cmake \ --D CMAKE_BUILD_TYPE=Release \ --D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 -O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \ --D HIP_ONLINE_COMPILER_FLAGS="-DCK_AMD_GPU_GFX908" \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_PREFIX_PATH=/opt/rocm \ --D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ +# Need to specify target ID, example below is for gfx908 and gfx90a +cmake \ +-D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_CXX_FLAGS="-O3" \ +-D CMAKE_BUILD_TYPE=Release \ +-D GPU_TARGETS="gfx908;gfx90a" \ .. ``` -Build drivers: \ -``conv_fwd_driver_offline`` is (offline compilation) driver for forward convolution, \ -``conv_bwd_driver_offline`` is (offline compilation) driver for backward-data convolution \ -``conv_fwd_driver_online`` is (online compilation) driver for forward convolution -``` - make -j conv_fwd_driver_offline - make -j conv_bwd_driver_offline - make -j conv_fwd_driver_online +### Build examples and tests +```bash + make -j examples tests + make test ``` -# Run -* layout: 0 = NCHW; 1 = NHWC -* algo: algorithm -* verify: 0 = no verification; 1 = do verification -* init: 0 ~ 5. initialization method -* log: 0 = no log; 1 = do log -* repeat: number of time kernel being launched -``` -######################################################## layout algo verify init log repeat N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads - ./host/driver_offline/conv_fwd_driver_offline 0 4 0 0 0 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 - ./host/driver_offline/conv_fwd_driver_offline 0 4 0 0 0 1 256 1024 256 3 3 14 14 1 1 1 1 1 1 1 1 - ./host/driver_offline/conv_fwd_driver_offline 1 5 0 0 0 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 - ./host/driver_offline/conv_fwd_driver_offline 1 5 0 0 0 1 256 1024 256 3 3 14 14 1 1 1 1 1 1 1 1 - ./host/driver_offline/conv_bwd_driver_offline 1 5 0 0 0 1 256 256 1024 3 3 14 14 1 1 1 1 1 1 1 1 -``` +Instructions for running each individual examples are under [example](/example) -# Result -Forward convoltuion, FP16, NCHW -``` -./host/driver_offline/conv_fwd_driver_offline 0 4 0 0 0 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 - -layout: 0 -in: dim 4, lengths {128, 192, 71, 71}, strides {967872, 5041, 71, 1} -wei: dim 4, lengths {256, 192, 3, 3}, strides {1728, 9, 3, 1} -out: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1296, 36, 1} -InLeftPads size 2, {1, 1, } -InRightPads size 2, {1, 1, } -ConvStrides size 2, {2, 2, } -ConvDilations size 2, {1, 1, } -device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw -a_k0_m_k1_grid_desc{216, 256, 8} -b_k0_n_k1_grid_desc{216, 165888, 8} -c_m_n_grid_desc{ 256, 165888} -launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 1 times... -Average time : 1.4155 ms, 103.686 TFlop/s -``` -Forward convoltuion, FP16, NCHW +## Build ckProfiler +```bash + make -j ckProfiler ``` - ./host/driver_offline/conv_fwd_driver_offline 0 4 0 0 0 1 256 1024 256 3 3 14 14 1 1 1 1 1 1 1 1 - - layout: 0 -in: dim 4, lengths {256, 256, 14, 14}, strides {50176, 196, 14, 1} -wei: dim 4, lengths {1024, 256, 3, 3}, strides {2304, 9, 3, 1} -out: dim 4, lengths {256, 1024, 14, 14}, strides {200704, 196, 14, 1} -InLeftPads size 2, {1, 1, } -InRightPads size 2, {1, 1, } -ConvStrides size 2, {1, 1, } -ConvDilations size 2, {1, 1, } -device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw -a_k0_m_k1_grid_desc{288, 1024, 8} -b_k0_n_k1_grid_desc{288, 50176, 8} -c_m_n_grid_desc{ 1024, 50176} -launch_and_time_kernel: grid_dim {1568, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 1 times... -Average time : 2.21357 ms, 106.959 TFlop/s - ``` - - Forward convolution, FP16, NHWC - ``` - ./host/driver_offline/conv_fwd_driver_offline 1 5 0 0 0 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 - - layout: 1 -in: dim 4, lengths {128, 71, 71, 192}, strides {967872, 13632, 192, 1} -wei: dim 4, lengths {256, 3, 3, 192}, strides {1728, 576, 192, 1} -out: dim 4, lengths {128, 36, 36, 256}, strides {331776, 9216, 256, 1} -InLeftPads size 2, {1, 1, } -InRightPads size 2, {1, 1, } -ConvStrides size 2, {2, 2, } -ConvDilations size 2, {1, 1, } -device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk -a_k0_m_k1_grid_desc{216, 165888, 8} -b_k0_n_k1_grid_desc{216, 256, 8} -c_m_n_grid_desc{ 165888, 256} -launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 1 times... -Average time : 1.12014 ms, 131.025 TFlop/s - ``` - - Forward convolution, FP16, NHWC - ``` - ./host/driver_offline/conv_fwd_driver_offline 1 5 0 0 0 1 256 1024 256 3 3 14 14 1 1 1 1 1 1 1 1 - - layout: 1 -in: dim 4, lengths {256, 14, 14, 256}, strides {50176, 3584, 256, 1} -wei: dim 4, lengths {1024, 3, 3, 256}, strides {2304, 768, 256, 1} -out: dim 4, lengths {256, 14, 14, 1024}, strides {200704, 14336, 1024, 1} -InLeftPads size 2, {1, 1, } -InRightPads size 2, {1, 1, } -ConvStrides size 2, {1, 1, } -ConvDilations size 2, {1, 1, } -device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk -a_k0_m_k1_grid_desc{288, 50176, 8} -b_k0_n_k1_grid_desc{288, 1024, 8} -c_m_n_grid_desc{ 50176, 1024} -launch_and_time_kernel: grid_dim {1568, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 1 times... -Average time : 1.86877 ms, 126.693 TFlop/s - ``` - - Backward data convolution, FP16, NHWC - ``` - ./host/driver_offline/conv_bwd_driver_offline 1 1 0 3 0 1 256 256 1024 3 3 14 14 1 1 1 1 1 1 1 1 - - layout: 1 -in: dim 4, lengths {256, 14, 14, 1024}, strides {200704, 14336, 1024, 1} -wei: dim 4, lengths {256, 3, 3, 1024}, strides {9216, 3072, 1024, 1} -out: dim 4, lengths {256, 14, 14, 256}, strides {50176, 3584, 256, 1} -InLeftPads size 2, {1, 1, } -InRightPads size 2, {1, 1, } -ConvStrides size 2, {1, 1, } -ConvDilations size 2, {1, 1, } -device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk -a_k0_m_k1_grid_desc{288, 50176, 8} -b_k0_n_k1_grid_desc{288, 1024, 8} -c_m_n_grid_desc{ 50176, 1024} -launch_and_time_kernel: grid_dim {1568, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 1 times... -Average time : 2.22461 ms, 106.428 TFlop/s +Instructions for running ckProfiler are under [profiler](/profiler) + +## Install CK +```bash +make install ``` + +## Using CK as pre-built kernel library +Instructions for using CK as a pre-built kernel library are under [client_example](/client_example) + +## Caveat +### Kernel Timing and Verification +CK's own kernel timer will warn up kernel once, and then run it multiple times +to get average kernel time. For some kernels that use atomic add, this will cause +output buffer to be accumulated multiple times, causing verification failure. +To work around it, do not use CK's own timer and do verification at the same time. +CK's own timer and verification in each example and ckProfiler can be enabled or +disabled from command line. diff --git a/client_example/01_gemm/CMakeLists.txt b/client_example/01_gemm/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e741192f90b8216e4b3abe32ae8971fb45ddfee --- /dev/null +++ b/client_example/01_gemm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_gemm gemm.cpp) +target_link_libraries(client_gemm PRIVATE composable_kernel::device_operations) diff --git a/client_example/01_gemm/gemm.cpp b/client_example/01_gemm/gemm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ba7118ba3929e3e3bcdf02e40044748860bfeebe --- /dev/null +++ b/client_example/01_gemm/gemm.cpp @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +using ADataType = F16; +using BDataType = F16; +using CDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 1) + { + // use default case + } + else if(argc == 7) + { + M = std::stoi(argv[1]); + N = std::stoi(argv[2]); + K = std::stoi(argv[3]); + + StrideA = std::stoi(argv[4]); + StrideB = std::stoi(argv[5]); + StrideC = std::stoi(argv[6]); + } + else + { + printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n"); + exit(0); + } + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if constexpr(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{})); + + using DeviceOp = + ck::tensor_operation::device::DeviceGemm; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto c_element_op = CElementOp{}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt b/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b724ccc484d6c9ea83e791aa5b8fcde420eef7 --- /dev/null +++ b/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt @@ -0,0 +1,13 @@ +add_custom_target(client_gemm_fastgelu_examples) + +add_executable(client_gemm_add_add_fastgelu gemm_add_add_fastgelu.cpp) +target_link_libraries(client_gemm_add_add_fastgelu PRIVATE composable_kernel::device_operations) + +add_executable(client_gemm_add_fastgelu gemm_add_fastgelu.cpp) +target_link_libraries(client_gemm_add_fastgelu PRIVATE composable_kernel::device_operations) + +add_executable(client_gemm_fastgelu gemm_fastgelu.cpp) +target_link_libraries(client_gemm_fastgelu PRIVATE composable_kernel::device_operations) + +add_dependencies(client_gemm_fastgelu_examples client_gemm_add_add_fastgelu client_gemm_add_fastgelu + client_gemm_fastgelu) diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..08f297f58a8d522aec8c991a8d38484ccf8e420c --- /dev/null +++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddAddFastGelu; + +using ADataType = F16; +using BDataType = F16; +using D0DataType = F16; +using D1DataType = F16; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Row; +using ELayout = Row; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideD0 = 0; + ck::index_t StrideD1 = 4096; + ck::index_t StrideE = 4096; + + if(argc == 1) + { + // use default case + } + else if(argc == 9) + { + M = std::stoi(argv[1]); + N = std::stoi(argv[2]); + K = std::stoi(argv[3]); + + StrideA = std::stoi(argv[4]); + StrideB = std::stoi(argv[5]); + StrideD0 = std::stoi(argv[6]); + StrideD1 = std::stoi(argv[7]); + StrideE = std::stoi(argv[8]); + } + else + { + printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n"); + exit(0); + } + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if constexpr(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) * + f_matrix_space_size(M, N, StrideD0, D0Layout{})); + SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) * + f_matrix_space_size(M, N, StrideD1, D1Layout{})); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{})); + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + ALayout, + BLayout, + ck::Tuple, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddAddFastGelu>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer(), + d1_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0, StrideD1}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer(), + d1_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0, StrideD1}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..658c1e9e8fcbeab1ec7be4115d5a7d62c5e77283 --- /dev/null +++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddFastGelu; + +using ADataType = F16; +using BDataType = F16; +using D0DataType = F16; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using ELayout = Row; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideD0 = 0; + ck::index_t StrideE = 4096; + + if(argc == 1) + { + // use default case + } + else if(argc == 8) + { + M = std::stoi(argv[1]); + N = std::stoi(argv[2]); + K = std::stoi(argv[3]); + + StrideA = std::stoi(argv[4]); + StrideB = std::stoi(argv[5]); + StrideD0 = std::stoi(argv[6]); + StrideE = std::stoi(argv[8]); + } + else + { + printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideD0, StrideE\n"); + exit(0); + } + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if constexpr(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) * + f_matrix_space_size(M, N, StrideD0, D0Layout{})); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{})); + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + ALayout, + BLayout, + ck::Tuple, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddFastGelu>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ea269545a5cc576f7cf98ea8496acf713b6aaea6 --- /dev/null +++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using FastGelu = ck::tensor_operation::element_wise::FastGelu; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = FastGelu; + +using ADataType = F16; +using BDataType = F16; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using ELayout = Row; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideE = 4096; + + if(argc == 1) + { + // use default case + } + else if(argc == 7) + { + M = std::stoi(argv[1]); + N = std::stoi(argv[2]); + K = std::stoi(argv[3]); + + StrideA = std::stoi(argv[4]); + StrideB = std::stoi(argv[5]); + StrideE = std::stoi(argv[8]); + } + else + { + printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideE\n"); + exit(0); + } + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if constexpr(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{})); + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + ALayout, + BLayout, + ck::Tuple<>, + ELayout, + ADataType, + BDataType, + ck::Tuple<>, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::FastGelu>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + {}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + {}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/03_gemm_layernorm/CMakeLists.txt b/client_example/03_gemm_layernorm/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b38698d9064e6fa3cbd2e01c8818c0ebfe361cb3 --- /dev/null +++ b/client_example/03_gemm_layernorm/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(client_gemm_add_add_layernorm_naive gemm_add_add_layernorm_naive.cpp) +target_link_libraries(client_gemm_add_add_layernorm_naive PRIVATE composable_kernel::device_operations) + +add_executable(client_gemm_add_relu_add_layernorm_welford gemm_add_relu_add_layernorm_welford.cpp) +target_link_libraries(client_gemm_add_relu_add_layernorm_welford PRIVATE composable_kernel::device_operations) diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp new file mode 100644 index 0000000000000000000000000000000000000000..caa6573788d201b9fda605e5388423d5966d41ec --- /dev/null +++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm_naive.cpp @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ADataType = F16; +using BDataType = F16; +using BiasDataType = F32; +using CDataType = F16; +using D0DataType = F16; +using ReduceDataType = F32; +using GammaDataType = F16; +using BetaDataType = F16; +using LayerNormOutDataType = F16; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +template +bool RunDeviceGemmMeanSquareMean(gemm_reduce_op_ptr& p_op, + const void* p_a, + const void* p_b, + const void* p_bias, + const void* p_d0, + void* p_c, + void* p_mean, + void* p_square_mean, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideC, + int StrideD0, + bool time_kernel) +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide; + using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; + + auto passOp = PassThrough{}; + auto squareOp = UnarySquareElementOp{}; + auto divOp = UnaryDivElementOp{N}; + + auto argument_ptr = + p_op->MakeArgumentPointer(p_a, + p_b, + p_bias, + {p_d0}, + p_c, + {p_mean, p_square_mean}, + M, + N, + K, + StrideA, + StrideB, + StrideC, + {StrideD0}, + {&passOp, &passOp, &passOp}, // functor for a, b, c + {&passOp}, // functor for d0 + {&passOp, &squareOp}, // functor for inputs of reduction + {&divOp, &divOp}); // functor for outputs of reduction + + if(p_op->IsSupportedArgument(argument_ptr.get())) + { + auto invoker_ptr = p_op->MakeInvokerPointer(); + + // If we evaluate running time of gemm_reduce. The output may wrong. + // Because we need to initialize the reduction tensor before runing the kernel. + // However we run kernel many times for time_kernel = trie without reinitialize the out + // of reduction tensor. + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + if(time_kernel) + std::cout << "Gemm + reduce Perf: " << std::setw(10) << ave_time << " ms" << std::endl; + + return true; + } + + return false; +} + +template +bool RunDeviceNormalize2D(normalize_op_ptr& p_op, + const void* p_x, + const void* p_mean, + const void* p_square_mean, + const void* p_gamma, + const void* p_beta, + void* p_y, + int M, + int N, + int StrideX, + bool time_kernel) +{ + std::array input = {p_x, p_mean, p_square_mean, p_gamma, p_beta}; + std::array output = {p_y}; + auto normalize_functor = ck::tensor_operation::element_wise::Normalize{}; + + std::array xyLengths = {M, N}; + std::array xyStrides = {StrideX, 1}; + + auto argument_ptr = p_op->MakeArgumentPointer(xyLengths, + {xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}}, + {xyStrides}, + input, + output, + ck::tensor_operation::element_wise::Normalize{}); + + if(p_op->IsSupportedArgument(argument_ptr.get())) + { + auto invoker_ptr = p_op->MakeInvokerPointer(); + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + if(time_kernel) + std::cout << "Normalize Perf: " << std::setw(10) << ave_time << " ms" << std::endl; + + return true; + } + + return false; +} + +int main() +{ + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideC = 1024; + ck::index_t StrideD0 = 1024; + + const auto gemm_reduce_ptrs = + ck::tensor_operation::device::instance::get_device_gemm_add_add_mean_squaremean_instances< + ADataType, + BDataType, + CDataType, + ALayout, + BLayout, + CLayout>(); + + const auto normalize_ptrs = + ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances< + CDataType, + ReduceDataType, + ReduceDataType, + GammaDataType, + BetaDataType, + LayerNormOutDataType>(); + + std::cout << "found " << gemm_reduce_ptrs.size() + << " gemm_reduceMean_reduceSquareMean instances" << std::endl; + + std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl; + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if constexpr(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem bias_device_buf(sizeof(BiasDataType) * N); + SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{})); + SimpleDeviceMem d0_device_buf(sizeof(D0DataType) * + f_matrix_space_size(M, N, StrideD0, CLayout{})); + SimpleDeviceMem reduceMean_device_buf(sizeof(ReduceDataType) * M); + SimpleDeviceMem reduceMeanSquare_device_buf(sizeof(ReduceDataType) * M); + SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N); + SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N); + SimpleDeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) * M * N); + + bool b_time_kernel = true; + bool b_only_run_first_kernel = true; + + // layernorm => (1) + (2) + // (1). c = gemm(a, b), reduce_mean(c), reduce_square_mean(c) + // (2). normalize(c, mean, square_mean, gamma, beta) + for(auto& gemm_reduce_ptr : gemm_reduce_ptrs) + { + // run first available kernel + if(RunDeviceGemmMeanSquareMean(gemm_reduce_ptr, + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + bias_device_buf.GetDeviceBuffer(), + d0_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + reduceMean_device_buf.GetDeviceBuffer(), + reduceMeanSquare_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + StrideC, + StrideD0, + b_time_kernel)) + { + if(b_only_run_first_kernel) + break; + } + else + { + std::cout << gemm_reduce_ptr->GetTypeString() << " does not support this problem" + << std::endl; + } + } + + for(auto& normalize_ptr : normalize_ptrs) + { + if(RunDeviceNormalize2D(normalize_ptr, + c_device_buf.GetDeviceBuffer(), + reduceMean_device_buf.GetDeviceBuffer(), + reduceMeanSquare_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + layerNorm_device_buf.GetDeviceBuffer(), + M, + N, + StrideC, + b_time_kernel)) + { + if(b_only_run_first_kernel) + break; + } + else + { + std::cout << normalize_ptr->GetTypeString() << " does not support this problem" + << std::endl; + } + } +} diff --git a/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d4f0c2048ba81c3a1f98a22c5eb57654b0498806 --- /dev/null +++ b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using F16 = ck::half_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd; + +// DataType +using ADataType = F16; +using BDataType = F16; +using D0DataType = F16; +using D1DataType = F16; +using GammaDataType = F16; +using BetaDataType = F16; +using HDataType = F16; + +// Layout +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Row; +using HLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddReluAdd; +using HElementOp = PassThrough; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{}, mMemSize_(mem_size) + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + void SetZero() const { (void)hipMemset(p_mem_, 0, mMemSize_); } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; + std::size_t mMemSize_; +}; + +int main(int argc, char* argv[]) +{ + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = K; + ck::index_t StrideB = K; + ck::index_t StrideD0 = 0; + ck::index_t StrideD1 = N; + ck::index_t StrideH = N; + + float epsilon = 1e-5; + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if constexpr(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem d0_device_buf(sizeof(D0DataType) * + f_matrix_space_size(M, N, StrideD0, D0Layout{})); + SimpleDeviceMem d1_device_buf(sizeof(D1DataType) * + f_matrix_space_size(M, N, StrideD1, D1Layout{})); + SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N); + SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N); + SimpleDeviceMem h_device_buf(sizeof(HDataType) * f_matrix_space_size(M, N, StrideH, HLayout{})); + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleDLayernorm< + ALayout, + BLayout, + ck::Tuple, + HLayout, + ADataType, + BDataType, + ck::Tuple, + GammaDataType, + BetaDataType, + HDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddReluAdd, + ck::tensor_operation::element_wise::PassThrough>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + const auto h_element_op = HElementOp{}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()}, + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + h_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + {StrideD0, StrideD1}, + StrideH, + epsilon, + a_element_op, + b_element_op, + cde_element_op, + h_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + SimpleDeviceMem workspace_dev(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + h_device_buf.SetZero(); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_byte = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + (sizeof(D0DataType) + sizeof(D1DataType) + sizeof(HDataType)) * M * N + + (sizeof(GammaDataType) + sizeof(BetaDataType)) * N; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()}, + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + h_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + {StrideD0, StrideD1}, + StrideH, + epsilon, + a_element_op, + b_element_op, + cde_element_op, + h_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + SimpleDeviceMem workspace_dev(workspace_sz); + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + h_device_buf.SetZero(); + + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/04_contraction/CMakeLists.txt b/client_example/04_contraction/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..971d5d9f1c00e471adb4e9a60c0331e602d61468 --- /dev/null +++ b/client_example/04_contraction/CMakeLists.txt @@ -0,0 +1,9 @@ +add_executable(client_contraction_scale contraction_scale.cpp) +target_link_libraries(client_contraction_scale PRIVATE composable_kernel::device_operations) + +add_executable(client_contraction_bilinear contraction_bilinear.cpp) +target_link_libraries(client_contraction_bilinear PRIVATE composable_kernel::device_operations) + +add_executable(contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp) +target_link_libraries(contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_operations) + diff --git a/client_example/04_contraction/contraction_bilinear.cpp b/client_example/04_contraction/contraction_bilinear.cpp new file mode 100644 index 0000000000000000000000000000000000000000..91dead41a4cac19db857b99a233839e9e6647c57 --- /dev/null +++ b/client_example/04_contraction/contraction_bilinear.cpp @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp" +#include "ck/library/utility/numeric.hpp" + +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = Bilinear; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DDataType = F32; +using DsDataType = ck::Tuple; +using EDataType = F32; + +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 2; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{524288, 4096, 128, 1}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{524288, 4096, 128, 1}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; + + float alpha = 1.f; + float beta = 1.f; + + if(argc == 1) + { + // use default case + } + else if(argc == 25) + { + const ck::index_t M0 = std::stoi(argv[1]); + const ck::index_t M1 = std::stoi(argv[2]); + + const ck::index_t N0 = std::stoi(argv[3]); + const ck::index_t N1 = std::stoi(argv[4]); + + const ck::index_t K0 = std::stoi(argv[5]); + const ck::index_t K1 = std::stoi(argv[6]); + + a_ms_ks_lengths = {M0, M1, K0, K1}; + a_ms_ks_strides = { + std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])}; + + b_ns_ks_lengths = {N0, N1, K0, K1}; + b_ns_ks_strides = { + std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])}; + + d_ms_ns_lengths = {M0, M1, N0, N1}; + d_ms_ns_strides = { + std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])}; + + e_ms_ns_lengths = {M0, M1, N0, N1}; + e_ms_ns_strides = { + std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])}; + + alpha = std::stof(argv[23]); + beta = std::stof(argv[24]); + } + else + { + printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n"); + printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n"); + printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n"); + printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n"); + printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n"); + printf("arg23 to 24: alpha, beta\n"); + exit(0); + } + + auto f_tensor_space_size = [](auto lengths, auto strides) { + std::size_t space_size = 1; + for(std::size_t i = 0; i < lengths.size(); ++i) + { + space_size += (lengths[i] - 1) * strides[i]; + } + return space_size; + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * + f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides)); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * + f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides)); + SimpleDeviceMem d_device_buf(sizeof(DDataType) * + f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides)); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * + f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides)); + + using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD< + NumDimM, + NumDimN, + NumDimK, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{alpha, beta}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = + op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{d_ms_ns_lengths}, + std::array, 1>{d_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + ck::index_t M = ck::accumulate_n( + e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(DDataType) * M * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return 0; +} diff --git a/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp b/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..62be3377a2fb18bf388b857ecb758c0b7987871c --- /dev/null +++ b/client_example/04_contraction/contraction_g1m2n3k1_add_xdl_fp16.cpp @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batched_gemm_bias_permute.hpp" +#include "ck/library/utility/numeric.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Add = ck::tensor_operation::element_wise::Add; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = Add; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; +using DDataType = F16; +using DsDataType = ck::Tuple; +using EDataType = F16; + +static constexpr ck::index_t NumDimG = 1; +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 3; +static constexpr ck::index_t NumDimK = 1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + ck::index_t G0 = 1; + + ck::index_t M0 = 64; + ck::index_t M1 = 256; + + ck::index_t N0 = 3; + ck::index_t N1 = 12; + ck::index_t N2 = 64; + + ck::index_t K0 = 768; + + // A[M0, M1, M2, K0] + std::vector a_gs_ms_ks_lengths{G0, M0, M1, K0}; + std::vector a_gs_ms_ks_strides{M0 * M1 * K0, M1 * K0, K0, 1}; + // B[N0, N1, N2, K0] + std::vector b_gs_ns_ks_lengths{G0, N0, N1, N2, K0}; + std::vector b_gs_ns_ks_strides{N0 * N1 * N2 * K0, N1 * N2 * K0, N2 * K0, K0, 1}; + + // D[N0, M0, N1, M1, N2] + std::vector d_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2}; + std::vector d_gs_ms_ns_strides{N0 * N1 * N2, 0, 0, N1 * N2, N2, 1}; + // E[N0 M0 N1 N2 M1] + std::vector e_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2}; + std::vector e_gs_ms_ns_strides{ + M0 * M1 * N0 * N1 * N2, N1 * N2 * M1, 1, M0 * N1 * N2 * M1, M1 * N2, M1}; + + auto f_tensor_space_size = [](auto lengths, auto strides) { + std::size_t space_size = 1; + for(std::size_t i = 0; i < lengths.size(); ++i) + { + space_size += (lengths[i] - 1) * strides[i]; + } + return space_size; + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * + f_tensor_space_size(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * + f_tensor_space_size(b_gs_ns_ks_lengths, b_gs_ns_ks_strides)); + SimpleDeviceMem d_device_buf(sizeof(DDataType) * + f_tensor_space_size(d_gs_ms_ns_lengths, d_gs_ms_ns_strides)); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * + f_tensor_space_size(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)); + + using DeviceOp = ck::tensor_operation::device::DeviceBatchedContractionMultipleD< + NumDimG, + NumDimM, + NumDimN, + NumDimK, + ADataType, + BDataType, + DsDataType, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Add>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = + op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + std::array, 1>{d_gs_ms_ns_lengths}, + std::array, 1>{d_gs_ms_ns_strides}, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + ck::index_t M = ck::accumulate_n( + e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(DDataType) * M * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return 0; +} diff --git a/client_example/04_contraction/contraction_scale.cpp b/client_example/04_contraction/contraction_scale.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4e08ee19cdb098b2dfb70a662d59c87008400123 --- /dev/null +++ b/client_example/04_contraction/contraction_scale.cpp @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp" +#include "ck/library/utility/numeric.hpp" + +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = Scale; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F32; + +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 2; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{524288, 4096, 128, 1}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; + + float scale = 1.f; + + if(argc == 1) + { + // use default case + } + else if(argc == 20) + { + const ck::index_t M0 = std::stoi(argv[1]); + const ck::index_t M1 = std::stoi(argv[2]); + + const ck::index_t N0 = std::stoi(argv[3]); + const ck::index_t N1 = std::stoi(argv[4]); + + const ck::index_t K0 = std::stoi(argv[5]); + const ck::index_t K1 = std::stoi(argv[6]); + + a_ms_ks_lengths = {M0, M1, K0, K1}; + a_ms_ks_strides = { + std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])}; + + b_ns_ks_lengths = {N0, N1, K0, K1}; + b_ns_ks_strides = { + std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])}; + + e_ms_ns_lengths = {M0, M1, N0, N1}; + e_ms_ns_strides = { + std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])}; + + scale = std::stof(argv[19]); + } + else + { + printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n"); + printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n"); + printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n"); + printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n"); + printf("arg19: scale\n"); + exit(0); + } + + auto f_tensor_space_size = [](auto lengths, auto strides) { + std::size_t space_size = 1; + for(std::size_t i = 0; i < lengths.size(); ++i) + { + space_size += (lengths[i] - 1) * strides[i]; + } + return space_size; + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * + f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides)); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * + f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides)); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * + f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides)); + + using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD< + NumDimM, + NumDimN, + NumDimK, + ADataType, + BDataType, + ck::Tuple<>, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{scale}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{}, + e_device_buf.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 0>{}, + std::array, 0>{}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + ck::index_t M = ck::accumulate_n( + e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return 0; +} diff --git a/client_example/05_layernorm/CMakeLists.txt b/client_example/05_layernorm/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b582b485d4ce46951aaed98c6256e1c997388bd3 --- /dev/null +++ b/client_example/05_layernorm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_layernorm2d layernorm2d.cpp) +target_link_libraries(client_layernorm2d PRIVATE composable_kernel::device_operations) diff --git a/client_example/05_layernorm/layernorm2d.cpp b/client_example/05_layernorm/layernorm2d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..856a4cc21935f094bfd7040ea095fb04d78b7eb4 --- /dev/null +++ b/client_example/05_layernorm/layernorm2d.cpp @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using ComputeDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t Stride = 1024; + + auto xy_size = (M - 1) * Stride + N; + + SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size); + SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N); + SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N); + SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size); + + using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths + {Stride, 1}, // xStrides + {0, 1}, // gammaStrides + {0, 1}, // betaStrides + {Stride, 1}, // yStrides + {1}, // reduceDims + 1e-4, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N + + sizeof(BetaDataType) * N + sizeof(YDataType) * M * N; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths + {Stride, 1}, // xStrides + {1}, // gammaStrides + {1}, // betaStrides + {Stride, 1}, // yStrides + {1}, // reduceDims + 1e-4, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/06_softmax/CMakeLists.txt b/client_example/06_softmax/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b38a0fd9e27570e62df7f701572a24fb4ee842f9 --- /dev/null +++ b/client_example/06_softmax/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_softmax4d softmax4d.cpp) +target_link_libraries(client_softmax4d PRIVATE composable_kernel::device_operations) diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e939ce8dfedb10166b15388cae1d659d33e2154a --- /dev/null +++ b/client_example/06_softmax/softmax4d.cpp @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/softmax.hpp" + +using InDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 4; +constexpr int NumReduceDim = 2; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::vector in_lengths{2, 8, 128, 1024}; + std::vector in_strides{8 * 128 * 1024, 128 * 1024, 1024, 1}; + std::vector reduce_dims{2, 3}; + + ck::index_t num_elements = + std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies()); + + double alpha{2.0}; + double beta{2.0}; + + SimpleDeviceMem in(sizeof(InDataType) * num_elements); + SimpleDeviceMem out(sizeof(OutDataType) * num_elements); + + using DeviceOp = ck::tensor_operation::device:: + DeviceSoftmax; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim) + { + continue; + } + + auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + reduce_dims, + alpha, + beta, + in.GetDeviceBuffer(), + out.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = num_elements * sizeof(InDataType) + + (beta == 0.0f ? 1 : 2) * num_elements * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + reduce_dims, + alpha, + beta, + in.GetDeviceBuffer(), + out.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/07_grouped_convnd_fwd/CMakeLists.txt b/client_example/07_grouped_convnd_fwd/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fce7e91c1ef5b23fcfecfe99cc49263a1d779d76 --- /dev/null +++ b/client_example/07_grouped_convnd_fwd/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp) +target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations) + +add_executable(client_grouped_conv1d_fwd grouped_conv1d_fwd.cpp) +target_link_libraries(client_grouped_conv1d_fwd PRIVATE composable_kernel::device_operations) diff --git a/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp b/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9fbdb83b1cf10082958730e5115f3bf54e12d415 --- /dev/null +++ b/client_example/07_grouped_convnd_fwd/grouped_conv1d_fwd.cpp @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::GNWC; +using WeiLayout = ck::tensor_layout::convolution::GKXC; +using OutLayout = ck::tensor_layout::convolution::GNWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr ck::index_t NumDimSpatial = 1; +static constexpr ck::index_t G = 32; +static constexpr ck::index_t N = 256; +static constexpr ck::index_t K = 192; +static constexpr ck::index_t C = 192; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Wo = 28; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main() +{ + std::array in_lengths{G, N, Wi, C}; + std::array in_strides{0, 0, 0, 1}; + + std::array wei_lengths{G, K, X, C}; + std::array wei_strides{0, 0, 0, 1}; + + std::array out_lengths{G, N, Wo, K}; + std::array out_strides{0, 0, 0, 1}; + + std::partial_sum(rbegin(in_lengths), + std::prev(rend(in_lengths)), + std::next(rbegin(in_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(wei_lengths), + std::prev(rend(wei_lengths)), + std::next(rbegin(wei_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(out_lengths), + std::prev(rend(out_lengths)), + std::next(rbegin(out_strides)), + std::multiplies<>{}); + + // transpose GNWC/GKXC/GNWK to GNCW/GKCX/GNCW + std::rotate(rbegin(in_lengths), + std::next(rbegin(in_lengths)), + std::next(rbegin(in_lengths), NumDimSpatial + 1)); + std::rotate(rbegin(in_strides), + std::next(rbegin(in_strides)), + std::next(rbegin(in_strides), NumDimSpatial + 1)); + std::rotate(rbegin(wei_lengths), + std::next(rbegin(wei_lengths)), + std::next(rbegin(wei_lengths), NumDimSpatial + 1)); + std::rotate(rbegin(wei_strides), + std::next(rbegin(wei_strides)), + std::next(rbegin(wei_strides), NumDimSpatial + 1)); + std::rotate(rbegin(out_lengths), + std::next(rbegin(out_lengths)), + std::next(rbegin(out_lengths), NumDimSpatial + 1)); + std::rotate(rbegin(out_strides), + std::next(rbegin(out_strides)), + std::next(rbegin(out_strides), NumDimSpatial + 1)); + + std::array filter_strides{1}; + std::array filter_dilations{1}; + std::array input_left_pads{1}; + std::array input_right_pads{1}; + + SimpleDeviceMem in(sizeof(InDataType) * G * N * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * G * N * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + PassThrough, + PassThrough, + PassThrough>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {}, + {}, + out_lengths, + out_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * G * N * K * C * Wo * X; + std::size_t num_bytes = sizeof(InDataType) * G * N * Wi * C + + sizeof(WeiDataType) * G * K * X * C + + sizeof(OutDataType) * G * N * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {}, + {}, + out_lengths, + out_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } +} diff --git a/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ece6e30c56004f58ff136502b4d8874e8269cd3a --- /dev/null +++ b/client_example/07_grouped_convnd_fwd/grouped_conv2d_fwd.cpp @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 32; +static constexpr ck::index_t N = 256; +static constexpr ck::index_t K = 192; +static constexpr ck::index_t C = 192; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 28; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main() +{ + std::array in_lengths{G, N, Hi, Wi, C}; + std::array in_strides{0, 0, 0, 0, 1}; + + std::array wei_lengths{G, K, Y, X, C}; + std::array wei_strides{0, 0, 0, 0, 1}; + + std::array out_lengths{G, N, Ho, Wo, K}; + std::array out_strides{0, 0, 0, 0, 1}; + + std::partial_sum(rbegin(in_lengths), + std::prev(rend(in_lengths)), + std::next(rbegin(in_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(wei_lengths), + std::prev(rend(wei_lengths)), + std::next(rbegin(wei_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(out_lengths), + std::prev(rend(out_lengths)), + std::next(rbegin(out_strides)), + std::multiplies<>{}); + + // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW + std::rotate( + rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3)); + std::rotate( + rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3)); + std::rotate( + rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3)); + std::rotate( + rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3)); + std::rotate( + rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3)); + std::rotate( + rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3)); + + std::array filter_strides{1, 1}; + std::array filter_dilations{1, 1}; + std::array input_left_pads{1, 1}; + std::array input_right_pads{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + PassThrough, + PassThrough, + PassThrough>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {}, + {}, + out_lengths, + out_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C + + sizeof(WeiDataType) * G * K * Y * X * C + + sizeof(OutDataType) * G * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {}, + {}, + out_lengths, + out_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } +} diff --git a/client_example/08_fused_attention/CMakeLists.txt b/client_example/08_fused_attention/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..862b9ed5b70ee8a35c4ba44e2ccf1441e1532bd3 --- /dev/null +++ b/client_example/08_fused_attention/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(client_fused_attention fused_attention.cpp) +target_link_libraries(client_fused_attention PRIVATE composable_kernel::device_operations) + +add_executable(client_fused_attention_bias fused_attention_bias.cpp) +target_link_libraries(client_fused_attention_bias PRIVATE composable_kernel::device_operations) diff --git a/client_example/08_fused_attention/fused_attention.cpp b/client_example/08_fused_attention/fused_attention.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fe927da1248786a4b943f610ce38b75f0d88defd --- /dev/null +++ b/client_example/08_fused_attention/fused_attention.cpp @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using B0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = ck::tensor_operation::element_wise::PassThrough; +using CElementOp = ck::tensor_operation::element_wise::PassThrough; + +constexpr static auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +using ADataType = ck::half_t; +using B0DataType = ck::half_t; +using B1DataType = ck::half_t; +using CDataType = ck::half_t; +using AccDataType = float; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + int G0 = 48; + int G1 = 16; + int M = 1024; + int N = 1024; + int K = 64; + int O = 64; + + // A layout [G0, M, G1, K] + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1}; + + // B0 layout [G0, N, G1, K] + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1}; + + // B1 layout [G0, N, G1, O] + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O}; + + // C layout [G0, M, G1, O] + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1}; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K); + SimpleDeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K); + SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N); + SimpleDeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O); + + using DeviceOp = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2, + 1, + 1, + 1, + 1, + ADataType, + B0DataType, + B1DataType, + CDataType, + ck::Tuple<>, + ck::Tuple<>, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + MaskingSpec>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b0_device_buf.GetDeviceBuffer(), + b1_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + {}, // p_acc0_biases + {}, // p_acc1_biases + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + {}, // acc0_biases_gs_ms_ns_lengths + {}, // acc0_biases_gs_ms_ns_strides + {}, // acc1_biases_gs_ms_os_lengths + {}, // acc1_biases_gs_ms_os_strides + AElementOp{}, + B0ElementOp{}, + Acc0ElementOp{1 / sqrtf(K)}, + B1ElementOp{}, + CElementOp{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + G0 * G1; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best instance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b0_device_buf.GetDeviceBuffer(), + b1_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + {}, // p_acc0_biases + {}, // p_acc1_biases + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + {}, // acc0_biases_gs_ms_ns_lengths + {}, // acc0_biases_gs_ms_ns_strides + {}, // acc1_biases_gs_ms_os_lengths + {}, // acc1_biases_gs_ms_os_strides + AElementOp{}, + B0ElementOp{}, + Acc0ElementOp{1 / sqrtf(K)}, + B1ElementOp{}, + CElementOp{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/08_fused_attention/fused_attention_bias.cpp b/client_example/08_fused_attention/fused_attention_bias.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3113b7856025af74c610981db130a8d965c36a24 --- /dev/null +++ b/client_example/08_fused_attention/fused_attention_bias.cpp @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using B0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::ScaleAdd; +using B1ElementOp = ck::tensor_operation::element_wise::PassThrough; +using CElementOp = ck::tensor_operation::element_wise::PassThrough; + +constexpr static auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +using ADataType = ck::half_t; +using B0DataType = ck::half_t; +using B1DataType = ck::half_t; +using CDataType = ck::half_t; +using D0DataType = ck::half_t; +using AccDataType = float; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + int G0 = 48; + int G1 = 16; + int M = 1024; + int N = 1024; + int K = 64; + int O = 64; + + // A layout [G0, M, G1, K] + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1}; + + // B0 layout [G0, N, G1, K] + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1}; + + // B1 layout [G0, N, G1, O] + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O}; + + // C layout [G0, M, G1, O] + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1}; + + // D layout [G0, M, G1, N] + std::vector d0_gs_ms_ns_lengths{G0, G1, M, N}; + std::vector d0_gs_ms_ns_strides{M * G1 * N, N, G1 * N, 1}; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K); + SimpleDeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K); + SimpleDeviceMem d0_device_buf(sizeof(D0DataType) * G0 * G1 * M * N); + SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N); + SimpleDeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O); + + using DeviceOp = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2, + 1, + 1, + 1, + 1, + ADataType, + B0DataType, + B1DataType, + CDataType, + ck::Tuple, + ck::Tuple<>, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + MaskingSpec>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b0_device_buf.GetDeviceBuffer(), + b1_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + std::array{d0_device_buf.GetDeviceBuffer()}, // p_acc0_biases + {}, // p_acc1_biases + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + std::array, 1>{ + d0_gs_ms_ns_lengths}, // acc0_biases_gs_ms_ns_lengths + std::array, 1>{ + d0_gs_ms_ns_strides}, // acc0_biases_gs_ms_ns_strides + {}, // acc1_biases_gs_ms_os_lengths + {}, // acc1_biases_gs_ms_os_strides + AElementOp{}, + B0ElementOp{}, + Acc0ElementOp{1 / sqrtf(K)}, + B1ElementOp{}, + CElementOp{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O + + sizeof(D0DataType) * M * N) * + G0 * G1; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best instance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b0_device_buf.GetDeviceBuffer(), + b1_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + std::array{d0_device_buf.GetDeviceBuffer()}, // p_acc0_biases + {}, // p_acc1_biases + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + std::array, 1>{ + d0_gs_ms_ns_lengths}, // acc0_biases_gs_ms_ns_lengths + std::array, 1>{ + d0_gs_ms_ns_strides}, // acc0_biases_gs_ms_ns_strides + {}, // acc1_biases_gs_ms_os_lengths + {}, // acc1_biases_gs_ms_os_strides + AElementOp{}, + B0ElementOp{}, + Acc0ElementOp{1 / sqrtf(K)}, + B1ElementOp{}, + CElementOp{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/09_quantization/CMakeLists.txt b/client_example/09_quantization/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dc9b860c0c427a21e6127fcce5556dbb06089e9 --- /dev/null +++ b/client_example/09_quantization/CMakeLists.txt @@ -0,0 +1,11 @@ +add_executable(client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp) +target_link_libraries(client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations) + +add_executable(client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp) +target_link_libraries(client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations) + +add_executable(client_conv2d_fwd_perchannel_quantization conv2d_fwd_perchannel_quantization.cpp) +target_link_libraries(client_conv2d_fwd_perchannel_quantization PRIVATE composable_kernel::device_operations) + +add_executable(client_conv2d_fwd_perlayer_quantization conv2d_fwd_perlayer_quantization.cpp) +target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable_kernel::device_operations) diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bcb0cefa712cf144edf1916adf0c0f97515d56f0 --- /dev/null +++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using BiasDataType = int32_t; +using RequantScaleDataType = float; +using OutDataType = int8_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using BiasLayout = ck::tensor_layout::convolution::G_K; +using RequantScaleLayout = ck::tensor_layout::convolution::G_K; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::Relu; +using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 4; +static constexpr ck::index_t K = 64; +static constexpr ck::index_t C = 32; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 71; +static constexpr ck::index_t Wi = 71; +static constexpr ck::index_t Ho = 36; +static constexpr ck::index_t Wo = 36; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{G, N, C, Hi, Wi}; + std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; + std::array weight_lengths{G, K, C, Y, X}; + std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; + std::array bias_lengths{G, N, K, Ho, Wo}; + std::array bias_strides{K, 0, 1, 0, 0}; + std::array requant_scale_lengths{G, N, K, Ho, Wo}; + std::array requant_scale_strides{K, 0, 1, 0, 0}; + std::array out_lengths{G, N, C, Ho, Wo}; + std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; + std::array in_left_pad{1, 1}; + std::array in_right_pad{1, 1}; + std::array conv_strides{2, 2}; + std::array conv_dilations{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); + SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); + SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD< + NumDimSpatial, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + PassThrough, + PassThrough, + OutElementOp>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = + op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths, requant_scale_lengths}, + {bias_strides, requant_scale_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + + G * sizeof(WeiDataType) * K * Y * X * C + + G * sizeof(OutDataType) * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = + op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths, requant_scale_lengths}, + {bias_strides, requant_scale_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp new file mode 100644 index 0000000000000000000000000000000000000000..26c7aa15e2be0814b20b17f9c2c91fe21e70d961 --- /dev/null +++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using BiasDataType = int32_t; +using OutDataType = int8_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using BiasLayout = ck::tensor_layout::convolution::G_K; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::Relu; +using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 4; +static constexpr ck::index_t K = 64; +static constexpr ck::index_t C = 32; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 71; +static constexpr ck::index_t Wi = 71; +static constexpr ck::index_t Ho = 36; +static constexpr ck::index_t Wo = 36; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{G, N, C, Hi, Wi}; + std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; + std::array weight_lengths{G, K, C, Y, X}; + std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; + std::array bias_lengths{G, N, K, Ho, Wo}; + std::array bias_strides{K, 0, 1, 0, 0}; + std::array out_lengths{G, N, C, Ho, Wo}; + std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; + std::array in_left_pad{1, 1}; + std::array in_right_pad{1, 1}; + std::array conv_strides{2, 2}; + std::array conv_dilations{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); + SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + + using DeviceOp = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + PassThrough, + PassThrough, + OutElementOp>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths}, + {bias_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{0.5f, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + + G * sizeof(WeiDataType) * K * Y * X * C + + G * sizeof(OutDataType) * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths}, + {bias_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{0.5f, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp new file mode 100644 index 0000000000000000000000000000000000000000..475b2f03b4f558ad4bc319393472b926ebbfee2d --- /dev/null +++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using RequantScaleDataType = float; +using OutDataType = int8_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using RequantScaleLayout = ck::tensor_layout::convolution::G_K; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 4; +static constexpr ck::index_t K = 64; +static constexpr ck::index_t C = 32; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 71; +static constexpr ck::index_t Wi = 71; +static constexpr ck::index_t Ho = 36; +static constexpr ck::index_t Wo = 36; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{G, N, C, Hi, Wi}; + std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; + std::array weight_lengths{G, K, C, Y, X}; + std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; + std::array requant_scale_lengths{G, N, K, Ho, Wo}; + std::array requant_scale_strides{K, 0, 1, 0, 0}; + std::array out_lengths{G, N, C, Ho, Wo}; + std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; + std::array in_left_pad{1, 1}; + std::array in_right_pad{1, 1}; + std::array conv_strides{2, 2}; + std::array conv_dilations{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); + SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + + using DeviceOp = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + PassThrough, + PassThrough, + OutElementOp>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {requant_scale.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {requant_scale_lengths}, + {requant_scale_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + + G * sizeof(WeiDataType) * K * Y * X * C + + G * sizeof(OutDataType) * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {}, + {}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp new file mode 100644 index 0000000000000000000000000000000000000000..da7b7e6abffd1f5033e4a02700a1a939a950dc0e --- /dev/null +++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using OutDataType = int8_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 4; +static constexpr ck::index_t K = 64; +static constexpr ck::index_t C = 32; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 71; +static constexpr ck::index_t Wi = 71; +static constexpr ck::index_t Ho = 36; +static constexpr ck::index_t Wo = 36; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{G, N, C, Hi, Wi}; + std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; + std::array weight_lengths{G, K, C, Y, X}; + std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; + std::array out_lengths{G, N, C, Ho, Wo}; + std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; + std::array in_left_pad{1, 1}; + std::array in_right_pad{1, 1}; + std::array conv_strides{2, 2}; + std::array conv_dilations{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + PassThrough, + PassThrough, + OutElementOp>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {}, + {}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{0.5f, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + + G * sizeof(WeiDataType) * K * Y * X * C + + G * sizeof(OutDataType) * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {}, + {}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{0.5f, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt b/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e564f3180d8c9295356d90266f2159de47aac060 --- /dev/null +++ b/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp) +target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_operations) diff --git a/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp b/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp new file mode 100644 index 0000000000000000000000000000000000000000..55c789804230ccccf66d68be9244c5c4111451e6 --- /dev/null +++ b/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 32; +static constexpr ck::index_t N = 256; +static constexpr ck::index_t K = 192; +static constexpr ck::index_t C = 192; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 28; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main() +{ + std::array in_lengths{G, N, Hi, Wi, C}; + std::array in_strides{0, 0, 0, 0, 1}; + + std::array wei_lengths{G, K, Y, X, C}; + std::array wei_strides{0, 0, 0, 0, 1}; + + std::array out_lengths{G, N, Ho, Wo, K}; + std::array out_strides{0, 0, 0, 0, 1}; + + std::partial_sum(rbegin(in_lengths), + std::prev(rend(in_lengths)), + std::next(rbegin(in_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(wei_lengths), + std::prev(rend(wei_lengths)), + std::next(rbegin(wei_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(out_lengths), + std::prev(rend(out_lengths)), + std::next(rbegin(out_strides)), + std::multiplies<>{}); + + // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW + std::rotate( + rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3)); + std::rotate( + rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3)); + std::rotate( + rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3)); + std::rotate( + rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3)); + std::rotate( + rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3)); + std::rotate( + rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3)); + + std::array filter_strides{1, 1}; + std::array filter_dilations{1, 1}; + std::array input_left_pads{1, 1}; + std::array input_right_pads{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD, + InLayout, + OutDataType, + WeiDataType, + ck::Tuple<>, + InDataType, + PassThrough, + PassThrough, + PassThrough>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + in.GetDeviceBuffer(), + out_lengths, + out_strides, + wei_lengths, + wei_strides, + {}, + {}, + in_lengths, + in_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C + + sizeof(WeiDataType) * G * K * Y * X * C + + sizeof(OutDataType) * G * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + in.GetDeviceBuffer(), + out_lengths, + out_strides, + wei_lengths, + wei_strides, + {}, + {}, + in_lengths, + in_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } +} diff --git a/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt b/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..82162b606a6ee055cb985b960366a93fa4dd21e3 --- /dev/null +++ b/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt @@ -0,0 +1,9 @@ +add_executable(client_grouped_conv1d_bwd_weight_fp16 grouped_conv1d_bwd_weight_fp16.cpp) +add_executable(client_grouped_conv2d_bwd_weight_fp16 grouped_conv2d_bwd_weight_fp16.cpp) +add_executable(client_grouped_conv3d_bwd_weight_fp16 grouped_conv3d_bwd_weight_fp16.cpp) +add_executable(client_grouped_conv3d_bwd_weight_fp32 grouped_conv3d_bwd_weight_fp32.cpp) + +target_link_libraries(client_grouped_conv1d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations) +target_link_libraries(client_grouped_conv2d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations) +target_link_libraries(client_grouped_conv3d_bwd_weight_fp16 PRIVATE composable_kernel::device_operations) +target_link_libraries(client_grouped_conv3d_bwd_weight_fp32 PRIVATE composable_kernel::device_operations) diff --git a/client_example/11_grouped_conv_bwd_weight/common.hpp b/client_example/11_grouped_conv_bwd_weight/common.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a906263333c8a15947e12fe8702e431d4acb7999 --- /dev/null +++ b/client_example/11_grouped_conv_bwd_weight/common.hpp @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +template +std::size_t GetFlops(ck::index_t G, + ck::index_t N, + ck::index_t K, + ck::index_t C, + const std::array& output_spatial_lengths, + const std::array& filter_spatial_lengths) +{ + // 2 * G * N * K * C * * + return static_cast(2) * G * N * K * C * + std::accumulate(std::begin(output_spatial_lengths), + std::end(output_spatial_lengths), + static_cast(1), + std::multiplies<>()) * + std::accumulate(std::begin(filter_spatial_lengths), + std::end(filter_spatial_lengths), + static_cast(1), + std::multiplies<>()); +} + +template +std::size_t GetInputByte(ck::index_t G, + ck::index_t N, + ck::index_t C, + const std::array& input_spatial_lengths) +{ + // sizeof(InDataType) * (G * N * C * ) + + return sizeof(InDataType) * (G * N * C * + std::accumulate(std::begin(input_spatial_lengths), + std::end(input_spatial_lengths), + static_cast(1), + std::multiplies<>())); +} + +template +std::size_t GetWeightByte(ck::index_t G, + ck::index_t K, + ck::index_t C, + const std::array& filter_spatial_lengths) +{ + // sizeof(WeiDataType) * (G * K * C * ) + + return sizeof(WeiDataType) * (G * K * C * + std::accumulate(std::begin(filter_spatial_lengths), + std::end(filter_spatial_lengths), + static_cast(1), + std::multiplies<>())); +} + +template +std::size_t GetOutputByte(ck::index_t G, + ck::index_t N, + ck::index_t K, + const std::array& output_spatial_lengths) +{ + // sizeof(OutDataType) * (G * N * K * ); + return sizeof(OutDataType) * (G * N * K * + std::accumulate(std::begin(output_spatial_lengths), + std::end(output_spatial_lengths), + static_cast(1), + std::multiplies())); +} + +template +bool run_grouped_conv_bwd_weight( + ck::index_t G, + ck::index_t N, + ck::index_t K, + ck::index_t C, + const std::array& input_spatial_lengths, + const std::array& filter_spatial_lengths, + const std::array& output_spatial_lengths, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) +{ + + ck::index_t split_k = 2; + SimpleDeviceMem in(GetInputByte(G, N, C, input_spatial_lengths)); + SimpleDeviceMem wei(GetWeightByte(G, K, C, filter_spatial_lengths)); + SimpleDeviceMem out(GetOutputByte(G, N, K, output_spatial_lengths)); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + out.GetDeviceBuffer(), + G, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}, + split_k); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = + GetFlops(G, N, K, C, output_spatial_lengths, filter_spatial_lengths); + std::size_t num_bytes = + GetInputByte(G, N, C, input_spatial_lengths) + + GetWeightByte(G, K, C, filter_spatial_lengths) + + GetOutputByte(G, N, K, output_spatial_lengths); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return false; + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + out.GetDeviceBuffer(), + G, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}, + split_k); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return true; +} diff --git a/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp b/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..788d50ddefb704d9c08c081ae24f40480882a04f --- /dev/null +++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv1d_bwd_weight_fp16.cpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::GNWC; +using WeiLayout = ck::tensor_layout::convolution::GKXC; +using OutLayout = ck::tensor_layout::convolution::GNWK; + +static constexpr ck::index_t NumDimSpatial = 1; +static constexpr ck::index_t G = 32; +static constexpr ck::index_t N = 256; +static constexpr ck::index_t K = 192; +static constexpr ck::index_t C = 192; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Wo = 28; + +int main() +{ + return run_grouped_conv_bwd_weight(G, N, K, C, {Wi}, {X}, {Wo}, {1}, {1}, {1}, {1}) + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp b/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1903bd95b67b0834c73047cde0cdae6fe4e7fe82 --- /dev/null +++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight_fp16.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 32; +static constexpr ck::index_t N = 256; +static constexpr ck::index_t K = 192; +static constexpr ck::index_t C = 192; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 28; + +int main() +{ + return run_grouped_conv_bwd_weight( + G, N, K, C, {Hi, Wi}, {Y, X}, {Ho, Wo}, {1, 1}, {1, 1}, {1, 1}, {1, 1}) + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2f2b5d4e2113c090c0e171a4204f3f9cfde2d276 --- /dev/null +++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp16.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::GNDHWC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::GNDHWK; + +static constexpr ck::index_t NumDimSpatial = 3; +static constexpr ck::index_t G = 8; +static constexpr ck::index_t N = 64; +static constexpr ck::index_t K = 128; +static constexpr ck::index_t C = 128; +static constexpr ck::index_t Z = 3; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Di = 28; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 3; +static constexpr ck::index_t Do = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 3; + +int main() +{ + return run_grouped_conv_bwd_weight(G, + N, + K, + C, + {Di, Hi, Wi}, + {Z, Y, X}, + {Do, Ho, Wo}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}) + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp new file mode 100644 index 0000000000000000000000000000000000000000..796311d2318e7d86198fd8ea631680876028c985 --- /dev/null +++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv3d_bwd_weight_fp32.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using InDataType = float; +using WeiDataType = float; +using OutDataType = float; + +using InLayout = ck::tensor_layout::convolution::GNDHWC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::GNDHWK; + +static constexpr ck::index_t NumDimSpatial = 3; +static constexpr ck::index_t G = 8; +static constexpr ck::index_t N = 64; +static constexpr ck::index_t K = 128; +static constexpr ck::index_t C = 128; +static constexpr ck::index_t Z = 3; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Di = 28; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 3; +static constexpr ck::index_t Do = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 3; + +int main() +{ + return run_grouped_conv_bwd_weight(G, + N, + K, + C, + {Di, Hi, Wi}, + {Z, Y, X}, + {Do, Ho, Wo}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}) + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/client_example/12_elementwise_normalization/CMakeLists.txt b/client_example/12_elementwise_normalization/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ba0e1279a402ae00a6e3aead9de6c12c61e14c9 --- /dev/null +++ b/client_example/12_elementwise_normalization/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_elementwise_layernorm2d elementwise_layernorm2d.cpp) +target_link_libraries(client_elementwise_layernorm2d PRIVATE composable_kernel::device_operations) diff --git a/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp b/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de68f46d398958917e49bf14178f66414590ed86 --- /dev/null +++ b/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp" + +using ADataType = ck::half_t; // Input 1 +using BDataType = ck::half_t; // Input 2 +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using AccDataType = float; +using XElementwiseOperation = ck::tensor_operation::element_wise::Add; +using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main() +{ + bool time_kernel = true; + + ck::index_t M = 48 * 256; + ck::index_t N = 1024; + ck::index_t Stride = N; + + auto mn_size = (M - 1) * Stride + N; + + SimpleDeviceMem a_dev_buf(sizeof(ADataType) * mn_size); + SimpleDeviceMem b_dev_buf(sizeof(BDataType) * mn_size); + SimpleDeviceMem gamma_dev_buf(sizeof(GammaDataType) * N); + SimpleDeviceMem beta_dev_buf(sizeof(BetaDataType) * N); + SimpleDeviceMem y_dev_buf(sizeof(YDataType) * mn_size); + + std::array ab_input = {a_dev_buf.GetDeviceBuffer(), + b_dev_buf.GetDeviceBuffer()}; + std::vector abStride = {Stride, 1}; + std::array, 2> abStrides = {abStride, abStride}; + + using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization< + ck::Tuple, + GammaDataType, + BetaDataType, + AccDataType, + YDataType, + XElementwiseOperation, + YElementwiseOperation, + Rank, + NumReduceDim>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths + abStrides, + {0, 1}, // gammaStrides + {0, 1}, // betaStrides + {Stride, 1}, // yStrides + {1}, // reduceDims + 1e-4, + ab_input, + gamma_dev_buf.GetDeviceBuffer(), + beta_dev_buf.GetDeviceBuffer(), + y_dev_buf.GetDeviceBuffer(), + XElementwiseOperation{}, + YElementwiseOperation{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_byte = sizeof(ADataType) * M * N + sizeof(BDataType) * M * N + + sizeof(GammaDataType) * N + sizeof(BetaDataType) * N + + sizeof(YDataType) * M * N; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths + abStrides, + {1}, // gammaStrides + {1}, // betaStrides + {Stride, 1}, // yStrides + {1}, // reduceDims + 1e-4, + ab_input, + gamma_dev_buf.GetDeviceBuffer(), + beta_dev_buf.GetDeviceBuffer(), + y_dev_buf.GetDeviceBuffer(), + XElementwiseOperation{}, + YElementwiseOperation{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/13_batchnorm/CMakeLists.txt b/client_example/13_batchnorm/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc4f9d395c4a8b45e34db2ad0c5622484d379dba --- /dev/null +++ b/client_example/13_batchnorm/CMakeLists.txt @@ -0,0 +1,6 @@ +add_executable(client_batchnorm_fwd_nhwc batchnorm_fwd_nhwc.cpp) +add_executable(client_batchnorm_bwd_nhwc batchnorm_bwd_nhwc.cpp) +add_executable(client_batchnorm_infer_nhwc batchnorm_infer_nhwc.cpp) +target_link_libraries(client_batchnorm_fwd_nhwc PRIVATE composable_kernel::device_operations) +target_link_libraries(client_batchnorm_bwd_nhwc PRIVATE composable_kernel::device_operations) +target_link_libraries(client_batchnorm_infer_nhwc PRIVATE composable_kernel::device_operations) diff --git a/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp b/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8ef21986a4d9a5f1eb25e21a1073c4cc341da88d --- /dev/null +++ b/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp" + +using XDataType = ck::half_t; +using DxDataType = float; +using DyDataType = float; +using AccDataType = float; +using ScaleDataType = ck::half_t; +using DscaleDbiasDataType = float; +using MeanVarDataType = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 4; +constexpr int NumBatchNormReduceDim = 3; + +const double epsilon = std::numeric_limits::epsilon(); + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array xyLengths{16, 8, 128, 256}; + std::array xyStrides{8 * 128 * 256, 128 * 256, 256, 1}; + std::array scaleBiasMeanVarLengths{256}; + std::array scaleBiasMeanVarStrides{1}; + std::array reduceDims{0, 1, 2}; + + ck::index_t numXYElement = + std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies()); + + ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + 1, + std::multiplies()); + + SimpleDeviceMem x(sizeof(XDataType) * numXYElement); + SimpleDeviceMem dy(sizeof(DyDataType) * numXYElement); + SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem dx(sizeof(DxDataType) * numXYElement); + SimpleDeviceMem dscale(sizeof(DscaleDbiasDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem dbias(sizeof(DscaleDbiasDataType) * numScaleBiasMeanVarElement); + + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + dy.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + epsilon, + PassThrough{}, + dx.GetDeviceBuffer(), + dscale.GetDeviceBuffer(), + dbias.GetDeviceBuffer()); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + + SimpleDeviceMem workspace(workspace_sz); + + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = + numXYElement * (sizeof(XDataType) + sizeof(DyDataType) + sizeof(DxDataType)) + + numScaleBiasMeanVarElement * + (sizeof(ScaleDataType) + sizeof(DscaleDbiasDataType) * 2 + + sizeof(MeanVarDataType) * 2); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + if(found) + { + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + dy.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + epsilon, + PassThrough{}, + dx.GetDeviceBuffer(), + dscale.GetDeviceBuffer(), + dbias.GetDeviceBuffer()); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..322667a46bacae8d0c681939c3890ef9ff476b0e --- /dev/null +++ b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp" + +using XDataType = float; +using YDataType = float; +using AccDataType = float; +using ScaleDataType = AccDataType; +using BiasDataType = AccDataType; +using MeanVarDataType = AccDataType; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 4; +constexpr int NumBatchNormReduceDim = 3; + +const double epsilon = std::numeric_limits::epsilon(); +const double averageFactor = 0.1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array xyLengths{16, 8, 128, 256}; + std::array xyStrides{8 * 128 * 256, 128 * 256, 256, 1}; + std::array scaleBiasMeanVarLengths{256}; + std::array scaleBiasMeanVarStrides{1}; + std::array reduceDims{0, 1, 2}; + + ck::index_t numXYElement = + std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies()); + + ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + 1, + std::multiplies()); + + SimpleDeviceMem x(sizeof(XDataType) * numXYElement); + SimpleDeviceMem y(sizeof(YDataType) * numXYElement); + SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer(), + epsilon, + PassThrough{}, + y.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + averageFactor, + nullptr, + nullptr); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + + SimpleDeviceMem workspace(workspace_sz); + + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = + numXYElement * (sizeof(XDataType) + sizeof(YDataType)) + + numScaleBiasMeanVarElement * (sizeof(ScaleDataType) + sizeof(BiasDataType) + + sizeof(MeanVarDataType) + sizeof(MeanVarDataType)); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + if(found) + { + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer(), + epsilon, + PassThrough{}, + y.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + averageFactor, + nullptr, + nullptr); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp b/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3117d162db71a0a4d02a15decac20e1f0f50d56e --- /dev/null +++ b/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp" + +using XDataType = float; +using YDataType = float; +using ScaleDataType = float; +using BiasDataType = float; +using MeanVarDataType = float; + +constexpr int Rank = 4; +constexpr int NumBatchNormReduceDim = 3; + +using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; + +const double epsilon = std::numeric_limits::epsilon(); + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array xyLengths{16, 8, 128, 256}; + std::array xyStrides{8 * 128 * 256, 128 * 256, 256, 1}; + std::array scaleBiasMeanVarLengths{256}; + std::array scaleBiasMeanVarStrides{1}; + std::array reduceDims{0, 1, 2}; + std::array invariantDims{3}; + + ck::index_t numXYElement = + std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies()); + + ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + 1, + std::multiplies()); + + SimpleDeviceMem x(sizeof(XDataType) * numXYElement); + SimpleDeviceMem y(sizeof(YDataType) * numXYElement); + SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem variance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + + // values in variance need be non-negative + (void)hipMemset( + variance.GetDeviceBuffer(), 0, sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + + std::array aligned_scaleBiasMeanVarStrides{0}; + + int i = 0; + for(auto dim : invariantDims) + { + assert(xyLengths[dim] == scaleBiasMeanVarLengths[i]); + + aligned_scaleBiasMeanVarStrides[dim] = scaleBiasMeanVarStrides[i]; + i++; + }; + + using DeviceOp = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, + ck::Tuple, + Normalize, + Rank>; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + {xyStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides}, + {xyStrides}, + {x.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + variance.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer()}, + {y.GetDeviceBuffer()}, + Normalize{epsilon}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = + numXYElement * (sizeof(XDataType) + sizeof(YDataType)) + + numScaleBiasMeanVarElement * (sizeof(ScaleDataType) + sizeof(BiasDataType) + + sizeof(MeanVarDataType) + sizeof(MeanVarDataType)); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + if(found) + { + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + {xyStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides}, + {xyStrides}, + {x.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + variance.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer()}, + {y.GetDeviceBuffer()}, + Normalize{epsilon}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/14_instance_id/CMakeLists.txt b/client_example/14_instance_id/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..87b2a9a0cbc82a82c9567266178ee431fb9149e7 --- /dev/null +++ b/client_example/14_instance_id/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_batchnorm_fwd_instance_id batchnorm_fwd_instance_id.cpp) +target_link_libraries(client_batchnorm_fwd_instance_id PRIVATE composable_kernel::device_operations) diff --git a/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp b/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9cfeee1cfe106e69f83dc0184f3956d6751a2947 --- /dev/null +++ b/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp" + +using XDataType = float; +using YDataType = float; +using AccDataType = float; +using ScaleDataType = AccDataType; +using BiasDataType = AccDataType; +using MeanVarDataType = AccDataType; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 4; +constexpr int NumBatchNormReduceDim = 3; + +const double epsilon = std::numeric_limits::epsilon(); +const double averageFactor = 0.1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +// In the actual application, the instance index and name are usually from the perf db +static int instance_index = -1; +static std::string instance_name; + +int main(int argc, char* argv[]) +{ + std::array xyLengths{16, 8, 128, 256}; + std::array xyStrides{8 * 128 * 256, 128 * 256, 256, 1}; + std::array scaleBiasMeanVarLengths{256}; + std::array scaleBiasMeanVarStrides{1}; + std::array reduceDims{0, 1, 2}; + + ck::index_t numXYElement = + std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies()); + + ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + 1, + std::multiplies()); + + SimpleDeviceMem x(sizeof(XDataType) * numXYElement); + SimpleDeviceMem y(sizeof(YDataType) * numXYElement); + SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + bool found = false; + int best_op_index = -1; + float best_ave_time = std::numeric_limits::max(); + + // profile device operation instances and save the best performant instance index and instance + // name + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer(), + epsilon, + PassThrough{}, + y.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + averageFactor, + nullptr, + nullptr); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + + SimpleDeviceMem workspace(workspace_sz); + + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + if(ave_time < best_ave_time) + { + found = true; + best_op_index = i; + best_ave_time = ave_time; + } + } + } + + if(found) + { + instance_index = best_op_index; + instance_name = op_ptrs[instance_index]->GetTypeIdHashCode(); + }; + + // simulate the execution of the operation when the instance index and name are available + const auto op_ptrs_2 = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + if(instance_index >= 0 && instance_index < op_ptrs_2.size()) + { + auto& op_ptr = op_ptrs_2[instance_index]; + + if(op_ptr->GetTypeIdHashCode() == instance_name) + { + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer(), + epsilon, + PassThrough{}, + y.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + averageFactor, + nullptr, + nullptr); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + + SimpleDeviceMem workspace(workspace_sz); + + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float exec_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + size_t num_bytes = numXYElement * (sizeof(XDataType) + sizeof(YDataType)) + + numScaleBiasMeanVarElement * + (sizeof(ScaleDataType) + sizeof(BiasDataType) + + sizeof(MeanVarDataType) + sizeof(MeanVarDataType)); + + float gb_per_sec = num_bytes / 1.E6 / exec_time; + + std::cout << "Kernel execution time: " << std::setw(10) << exec_time + << " ms, effective data transfer bandwidth: " << gb_per_sec << " GB/s" + << std::endl; + } + }; + } + + return 0; +} diff --git a/client_example/15_convnd_bwd_data/CMakeLists.txt b/client_example/15_convnd_bwd_data/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a60a71674f84f6005deadf3744b7df346c0ac3c --- /dev/null +++ b/client_example/15_convnd_bwd_data/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(client_conv3d_bwd_data_fp16 conv3d_bwd_data_fp16.cpp) +add_executable(client_conv3d_bwd_data_fp32 conv3d_bwd_data_fp32.cpp) + +target_link_libraries(client_conv3d_bwd_data_fp16 PRIVATE composable_kernel::device_operations) +target_link_libraries(client_conv3d_bwd_data_fp32 PRIVATE composable_kernel::device_operations) diff --git a/client_example/15_convnd_bwd_data/common.hpp b/client_example/15_convnd_bwd_data/common.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9799fb73a5a05225f053de3305de736c485acf3a --- /dev/null +++ b/client_example/15_convnd_bwd_data/common.hpp @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +std::size_t GetFlops(ck::index_t N, + ck::index_t K, + ck::index_t C, + const std::vector& output_spatial_lengths, + const std::vector& weights_spatial_lengths) +{ + // 2 * N * K * C * * + + return static_cast(2) * N * K * C * + std::accumulate(std::begin(output_spatial_lengths), + std::end(output_spatial_lengths), + static_cast(1), + std::multiplies<>()) * + std::accumulate(std::begin(weights_spatial_lengths), + std::end(weights_spatial_lengths), + static_cast(1), + std::multiplies<>()); +} + +template +std::size_t +GetInputByte(ck::index_t N, ck::index_t C, const std::vector& input_spatial_lengths) +{ + // sizeof(InDataType) * (N * C * ) + + return sizeof(InDataType) * N * C * + std::accumulate(std::begin(input_spatial_lengths), + std::end(input_spatial_lengths), + static_cast(1), + std::multiplies<>()); +} + +template +std::size_t +GetWeightByte(ck::index_t K, ck::index_t C, const std::vector& weights_spatial_lengths) +{ + // sizeof(WeiDataType) * (K * C * ) + + return sizeof(WeiDataType) * K * C * + std::accumulate(std::begin(weights_spatial_lengths), + std::end(weights_spatial_lengths), + static_cast(1), + std::multiplies<>()); +} + +template +std::size_t +GetOutputByte(ck::index_t N, ck::index_t K, const std::vector& output_spatial_lengths) +{ + // sizeof(OutDataType) * (N * K * ); + return sizeof(OutDataType) * N * K * + std::accumulate(std::begin(output_spatial_lengths), + std::end(output_spatial_lengths), + static_cast(1), + std::multiplies()); +} + +template +bool run_conv_bwd_data(ck::index_t N, + ck::index_t K, + ck::index_t C, + const std::vector& in_spatial_lengths, + const std::vector& wei_spatial_lengths, + const std::vector& out_spatial_lengths) +{ + std::size_t in_mem_size = GetInputByte(N, C, in_spatial_lengths); + std::size_t wei_mem_size = GetWeightByte(K, C, wei_spatial_lengths); + std::size_t out_mem_size = GetOutputByte(N, K, out_spatial_lengths); + + SimpleDeviceMem in(in_mem_size); + SimpleDeviceMem wei(wei_mem_size); + SimpleDeviceMem out(out_mem_size); + + std::vector filter_strides(NumDimSpatial, 1); + std::vector filter_dilations(NumDimSpatial, 1); + std::vector input_left_pads(NumDimSpatial, 1); + std::vector input_right_pads(NumDimSpatial, 1); + + using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + std::size_t flop = GetFlops(N, K, C, out_spatial_lengths, wei_spatial_lengths); + std::size_t num_bytes = in_mem_size + wei_mem_size + out_mem_size; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + out.GetDeviceBuffer(), + N, + K, + C, + in_spatial_lengths, + wei_spatial_lengths, + out_spatial_lengths, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return false; + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + out.GetDeviceBuffer(), + N, + K, + C, + in_spatial_lengths, + wei_spatial_lengths, + out_spatial_lengths, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + return true; +} diff --git a/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp b/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5210567241ebd9ce84b8b0b576b295594bf8578b --- /dev/null +++ b/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp16.cpp @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::NDHWC; +using WeiLayout = ck::tensor_layout::convolution::KZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWK; + +static constexpr ck::index_t NumDimSpatial = 3; +static constexpr ck::index_t N = 64; +static constexpr ck::index_t K = 128; +static constexpr ck::index_t C = 64; +static constexpr ck::index_t Z = 3; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Di = 28; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Do = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 28; + +int main() +{ + return run_conv_bwd_data(N, K, C, {Di, Hi, Wi}, {Z, Y, X}, {Do, Ho, Wo}) + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp b/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp new file mode 100644 index 0000000000000000000000000000000000000000..441bdfe7bec3bd12fe948c2efc892ad5cc4d0184 --- /dev/null +++ b/client_example/15_convnd_bwd_data/conv3d_bwd_data_fp32.cpp @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using InDataType = float; +using WeiDataType = float; +using OutDataType = float; + +using InLayout = ck::tensor_layout::convolution::NDHWC; +using WeiLayout = ck::tensor_layout::convolution::KZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWK; + +static constexpr ck::index_t NumDimSpatial = 3; +static constexpr ck::index_t N = 64; +static constexpr ck::index_t K = 128; +static constexpr ck::index_t C = 64; +static constexpr ck::index_t Z = 3; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Di = 28; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Do = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 28; + +int main() +{ + return run_conv_bwd_data(N, K, C, {Di, Hi, Wi}, {Z, Y, X}, {Do, Ho, Wo}) + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/client_example/15_gemm_add_multiply/CMakeLists.txt b/client_example/15_gemm_add_multiply/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd2dcf9614016b93b43a52c2940d1701ee623584 --- /dev/null +++ b/client_example/15_gemm_add_multiply/CMakeLists.txt @@ -0,0 +1,3 @@ + +add_executable(client_gemm_add_multiply gemm_add_multiply.cpp) +target_link_libraries(client_gemm_add_multiply PRIVATE composable_kernel::device_operations) \ No newline at end of file diff --git a/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp b/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp new file mode 100644 index 0000000000000000000000000000000000000000..28524a9eee9b87db4223484497f8e2181ed366ea --- /dev/null +++ b/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_add_multiply.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddMultiply = ck::tensor_operation::element_wise::AddMultiply; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddMultiply; + +using ADataType = F16; +using BDataType = F16; +using D0DataType = F16; +using D1DataType = F16; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Row; +using ELayout = Row; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideD0 = 0; + ck::index_t StrideD1 = 4096; + ck::index_t StrideE = 4096; + + if(argc == 1) + { + // use default case + } + else if(argc == 9) + { + M = std::stoi(argv[1]); + N = std::stoi(argv[2]); + K = std::stoi(argv[3]); + + StrideA = std::stoi(argv[4]); + StrideB = std::stoi(argv[5]); + StrideD0 = std::stoi(argv[6]); + StrideD1 = std::stoi(argv[7]); + StrideE = std::stoi(argv[8]); + } + else + { + printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n"); + exit(0); + } + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if constexpr(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) * + f_matrix_space_size(M, N, StrideD0, D0Layout{})); + SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) * + f_matrix_space_size(M, N, StrideD1, D1Layout{})); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{})); + + using DeviceOp = + ck::tensor_operation::device::DeviceGemmMultipleD, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + AElementOp, + BElementOp, + CDEElementOp>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer(), + d1_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0, StrideD1}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer(), + d1_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0, StrideD1}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/15_reduce/CMakeLists.txt b/client_example/15_reduce/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d52675ba834e666d0a480d5c778baeacb223a24a --- /dev/null +++ b/client_example/15_reduce/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_reduce_nhwc_c reduce_nhwc_c.cpp) +target_link_libraries(client_reduce_nhwc_c PRIVATE composable_kernel::device_operations) diff --git a/client_example/15_reduce/reduce_nhwc_c.cpp b/client_example/15_reduce/reduce_nhwc_c.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2275158bcb26d36d871c9e7086b45d4539584785 --- /dev/null +++ b/client_example/15_reduce/reduce_nhwc_c.cpp @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp" + +using InDataType = float; +using OutDataType = float; +using AccDataType = float; +using ReduceAdd = ck::reduce::Add; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using UnaryDivide = ck::tensor_operation::element_wise::UnaryDivide; + +constexpr bool PropagateNan = false; +constexpr bool OutputIndex = false; + +constexpr int Rank = 4; +constexpr int NumReduceDim = 3; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{16, 8, 128, 256}; + std::array in_strides{8 * 128 * 256, 128 * 256, 256, 1}; + std::array out_lengths{256}; + std::array out_strides{1}; + std::array reduce_dims{0, 1, 2}; + + ck::index_t num_in_elements = + std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies()); + + ck::index_t num_out_elements = + std::accumulate(out_lengths.begin(), out_lengths.end(), 1, std::multiplies()); + + ck::index_t reduce_length = 1; + + for(auto dim : reduce_dims) + reduce_length *= in_lengths[dim]; + + double alpha{1.0}; + double beta{0.0}; + + SimpleDeviceMem in(sizeof(InDataType) * num_in_elements); + SimpleDeviceMem out(sizeof(OutDataType) * num_out_elements); + + using DeviceOp = ck::tensor_operation::device::DeviceReduce; + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + out_lengths, + out_strides, + reduce_dims, + alpha, + beta, + in.GetDeviceBuffer(), + nullptr, + out.GetDeviceBuffer(), + nullptr, + PassThrough{}, + UnaryDivide{reduce_length}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = num_in_elements * sizeof(InDataType) + + (beta == 0.0f ? 1 : 2) * num_out_elements * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + if(found) + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + out_lengths, + out_strides, + reduce_dims, + alpha, + beta, + in.GetDeviceBuffer(), + nullptr, + out.GetDeviceBuffer(), + nullptr, + PassThrough{}, + UnaryDivide{reduce_length}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/16_convnd_fwd/CMakeLists.txt b/client_example/16_convnd_fwd/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2580a370ca480e0b9ec101e5715ac9a631a72f6 --- /dev/null +++ b/client_example/16_convnd_fwd/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(client_conv3d_fwd_fp16 conv3d_fwd_fp16.cpp) +add_executable(client_conv3d_fwd_fp32 conv3d_fwd_fp32.cpp) + +target_link_libraries(client_conv3d_fwd_fp16 PRIVATE composable_kernel::device_operations) +target_link_libraries(client_conv3d_fwd_fp32 PRIVATE composable_kernel::device_operations) diff --git a/client_example/16_convnd_fwd/common.hpp b/client_example/16_convnd_fwd/common.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a6bb5aa65be098716b504ae9d0543ed9ae129dc0 --- /dev/null +++ b/client_example/16_convnd_fwd/common.hpp @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +template +std::size_t +GetFlops(const std::array& output_lengths, + const std::array& weights_lengths) +{ + // 2 * G * N * K * C * * + ck::index_t G = weights_lengths[0]; + ck::index_t N = output_lengths[1]; + ck::index_t K = weights_lengths[1]; + ck::index_t C = weights_lengths[2]; + + return static_cast(2) * G * N * K * C * + std::accumulate(std::next(std::begin(output_lengths), NumNonSpatialDim), + std::end(output_lengths), + static_cast(1), + std::multiplies<>()) * + std::accumulate(std::next(std::begin(weights_lengths), NumNonSpatialDim), + std::end(weights_lengths), + static_cast(1), + std::multiplies<>()); +} + +template +std::size_t +GetInputByte(const std::array& input_lengths) +{ + // sizeof(InDataType) * (G * N * C * ) + + return sizeof(InDataType) * std::accumulate(std::begin(input_lengths), + std::end(input_lengths), + static_cast(1), + std::multiplies<>()); +} + +template +std::size_t +GetWeightByte(const std::array& weights_lengths) +{ + // sizeof(WeiDataType) * (G * K * C * ) + + return sizeof(WeiDataType) * std::accumulate(std::begin(weights_lengths), + std::end(weights_lengths), + static_cast(1), + std::multiplies<>()); +} + +template +std::size_t +GetOutputByte(const std::array& output_lengths) +{ + // sizeof(OutDataType) * (G * N * K * ); + return sizeof(OutDataType) * std::accumulate(std::begin(output_lengths), + std::end(output_lengths), + static_cast(1), + std::multiplies()); +} + +template +bool run_grouped_conv_fwd(std::array in_lengths, + std::array wei_lengths, + std::array out_lengths) +{ + std::size_t in_mem_size = GetInputByte(in_lengths); + std::size_t wei_mem_size = GetWeightByte(wei_lengths); + std::size_t out_mem_size = GetOutputByte(out_lengths); + + SimpleDeviceMem in(in_mem_size); + SimpleDeviceMem wei(wei_mem_size); + SimpleDeviceMem out(out_mem_size); + + std::array in_strides; + std::array wei_strides; + std::array out_strides; + in_strides.fill(0); + wei_strides.fill(0); + out_strides.fill(0); + in_strides.back() = 1; + wei_strides.back() = 1; + out_strides.back() = 1; + + std::partial_sum(rbegin(in_lengths), + std::prev(rend(in_lengths)), + std::next(rbegin(in_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(wei_lengths), + std::prev(rend(wei_lengths)), + std::next(rbegin(wei_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(out_lengths), + std::prev(rend(out_lengths)), + std::next(rbegin(out_strides)), + std::multiplies<>{}); + + // transpose NDHWGC/KZYXGC/NDHWGK to GNDHWC/GKZYXC/GNDHWK to GNCDHW/GKCZYX/GNKDHW + std::rotate(std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 2), rend(in_lengths)); + std::rotate(rbegin(in_lengths), + std::next(rbegin(in_lengths)), + std::next(rbegin(in_lengths), NumDimSpatial + 1)); + + std::rotate(std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 2), rend(in_strides)); + std::rotate(rbegin(in_strides), + std::next(rbegin(in_strides)), + std::next(rbegin(in_strides), NumDimSpatial + 1)); + + std::rotate( + std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 2), rend(wei_lengths)); + std::rotate(rbegin(wei_lengths), + std::next(rbegin(wei_lengths)), + std::next(rbegin(wei_lengths), NumDimSpatial + 1)); + + std::rotate( + std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 2), rend(wei_strides)); + std::rotate(rbegin(wei_strides), + std::next(rbegin(wei_strides)), + std::next(rbegin(wei_strides), NumDimSpatial + 1)); + + std::rotate( + std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 2), rend(out_lengths)); + std::rotate(rbegin(out_lengths), + std::next(rbegin(out_lengths)), + std::next(rbegin(out_lengths), NumDimSpatial + 1)); + + std::rotate( + std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 2), rend(out_strides)); + std::rotate(rbegin(out_strides), + std::next(rbegin(out_strides)), + std::next(rbegin(out_strides), NumDimSpatial + 1)); + + std::array conv_filter_strides; + std::array conv_filter_dilations; + std::array input_left_pads; + std::array input_right_pads; + conv_filter_strides.fill(1); + conv_filter_dilations.fill(1); + input_left_pads.fill(1); + input_right_pads.fill(1); + + std::size_t flop = GetFlops(out_lengths, wei_lengths); + std::size_t num_bytes = in_mem_size + wei_mem_size + out_mem_size; + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + PassThrough, + PassThrough, + PassThrough>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer( + in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + std::array{}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + std::array, 0>{{}}, + std::array, 0>{{}}, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return false; + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer( + in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + std::array{}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + std::array, 0>{{}}, + std::array, 0>{{}}, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + return true; +} diff --git a/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp b/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..10f914bbee3fe22cd0c3f3cfc289382f2f58ca39 --- /dev/null +++ b/client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::KZYXGC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; + +static constexpr ck::index_t NumDimSpatial = 3; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 64; +static constexpr ck::index_t K = 128; +static constexpr ck::index_t C = 64; +static constexpr ck::index_t Z = 3; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Di = 28; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 3; +static constexpr ck::index_t Do = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 3; + +int main() +{ + return run_grouped_conv_fwd( + {N, Di, Hi, Wi, G, C}, {K, Z, Y, X, G, C}, {N, Do, Ho, Wo, G, K}) + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp b/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp new file mode 100644 index 0000000000000000000000000000000000000000..43c98f1e9b8ae8b89893d5e430d4f4ab89678803 --- /dev/null +++ b/client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using InDataType = float; +using WeiDataType = float; +using OutDataType = float; + +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::KZYXGC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; + +static constexpr ck::index_t NumDimSpatial = 3; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 64; +static constexpr ck::index_t K = 128; +static constexpr ck::index_t C = 64; +static constexpr ck::index_t Z = 3; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Di = 28; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 3; +static constexpr ck::index_t Do = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 3; + +int main() +{ + return run_grouped_conv_fwd( + {N, Di, Hi, Wi, G, C}, {K, Z, Y, X, G, C}, {N, Do, Ho, Wo, G, K}) + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..14c066e4a21ca145eb5ddff3e28f6bd3a45b2449 --- /dev/null +++ b/client_example/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.15) +project(ck_app) +add_compile_options(-std=c++17) + +find_package(composable_kernel 1.0.0 COMPONENTS device_operations) +find_package(hip REQUIRED PATHS /opt/rocm) +message(STATUS "Build with HIP ${hip_VERSION}") + +# add all example subdir +file(GLOB dir_list LIST_DIRECTORIES true *) +FOREACH(subdir ${dir_list}) + IF(IS_DIRECTORY "${subdir}" AND (NOT "${subdir}" MATCHES "build")) + add_subdirectory(${subdir}) + ENDIF() +ENDFOREACH() diff --git a/client_example/README.md b/client_example/README.md new file mode 100644 index 0000000000000000000000000000000000000000..64a7130d537b1e2fb8752c4031e8430d11a6a46a --- /dev/null +++ b/client_example/README.md @@ -0,0 +1,21 @@ +## +Client application links to CK library, and therefore CK library needs to be installed before building client applications. + + +## Build +```bash +mkdir -p client_example/build +cd client_example/build +``` + +```bash +cmake \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_PREFIX_PATH="/opt/rocm;${PATH_TO_CK_INSTALL_DIRECTORY}" \ +.. +``` + +### Build client example +```bash + make -j +``` diff --git a/cmake/EnableCompilerWarnings.cmake b/cmake/EnableCompilerWarnings.cmake index 9f193b2090441399dd11f86dbe95e374656f65e9..87bcb08e83057b4f162870706a420a2d88d468bf 100644 --- a/cmake/EnableCompilerWarnings.cmake +++ b/cmake/EnableCompilerWarnings.cmake @@ -65,8 +65,9 @@ else() -Wuninitialized -Wunreachable-code -Wunused - - -Wno-sign-compare + -Wno-reserved-identifier + -Werror + -Wsign-compare -Wno-extra-semi-stmt ) if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang") diff --git a/cmake/TargetFlags.cmake b/cmake/TargetFlags.cmake new file mode 100644 index 0000000000000000000000000000000000000000..4f83fb5d396881a84cd2aeac7f91716966a02ab6 --- /dev/null +++ b/cmake/TargetFlags.cmake @@ -0,0 +1,50 @@ + +function(get_target_property2 VAR TARGET PROPERTY) + get_target_property(_pflags ${TARGET} ${PROPERTY}) + if(_pflags) + set(${VAR} ${_pflags} PARENT_SCOPE) + else() + set(${VAR} "" PARENT_SCOPE) + endif() +endfunction() + + +macro(append_flags FLAGS TARGET PROPERTY PREFIX) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + else() + string(APPEND ${FLAGS} " ${PREFIX}${FLAG}") + endif() + endforeach() +endmacro() + +macro(append_link_flags FLAGS TARGET PROPERTY) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + elseif(FLAG MATCHES "^-.*") + string(APPEND ${FLAGS} " ${FLAG}") + elseif(EXISTS ${FLAG}) + string(APPEND ${FLAGS} " ${FLAG}") + else() + string(APPEND ${FLAGS} " -l${FLAG}") + endif() + endforeach() +endmacro() + +function(target_flags FLAGS TARGET) + set(_flags) + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "") + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D") + append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_OPTIONS" "") + append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "") + # message("_flags: ${_flags}") + set(${FLAGS} ${_flags} PARENT_SCOPE) +endfunction() diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake new file mode 100644 index 0000000000000000000000000000000000000000..3c6cb56ccea3490c6d7a4eaf6107749dafa87bdf --- /dev/null +++ b/cmake/googletest.cmake @@ -0,0 +1,49 @@ +include(FetchContent) + +set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against") + +if(GOOGLETEST_DIR) + set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override") +endif() + +message(STATUS "Fetching GoogleTest") + +list(APPEND GTEST_CMAKE_CXX_FLAGS + -Wno-undef + -Wno-reserved-identifier + -Wno-global-constructors + -Wno-missing-noreturn + -Wno-disabled-macro-expansion + -Wno-used-but-marked-unused + -Wno-switch-enum + -Wno-zero-as-null-pointer-constant + -Wno-unused-member-function + -Wno-comma + -Wno-old-style-cast + -Wno-deprecated +) +message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}") + +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG b85864c64758dec007208e56af933fc3f52044ee +) + +# Will be necessary for windows build +# set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_GetProperties(googletest) +if(NOT googletest_POPULATED) + FetchContent_Populate(googletest) + add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL) +endif() + +target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) +target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) +target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) +target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) + +set_target_properties(gtest PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(gtest_main PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(gmock PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(gmock_main PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/composable_kernel/include/gridwise_operation_wrapper.hpp b/composable_kernel/include/gridwise_operation_wrapper.hpp deleted file mode 100644 index 0a1e07ec57137ecd6238d609adb048151cf38496..0000000000000000000000000000000000000000 --- a/composable_kernel/include/gridwise_operation_wrapper.hpp +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER -#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER - -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - run_gridwise_operation(Xs... xs) -{ - GridwiseOp{}.Run(xs...); -} - -#endif diff --git a/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp b/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp deleted file mode 100644 index 09ea16fa2398b073229e2dce7240f42345822b4a..0000000000000000000000000000000000000000 --- a/composable_kernel/include/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp +++ /dev/null @@ -1,272 +0,0 @@ -#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP -#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP - -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" - -namespace ck { - -// Number of GEMMs = YTilda * XTilda -// GemmM = C -// GemmN = N * HTildaSlice * WTildaSlice -// GemmK = K * YDotSlice * XDotSlice -template -__host__ __device__ constexpr auto -transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( - const TensorDescriptor& wei_k_y_x_c_grid_desc, - const TensorDescriptor& out_n_ho_wo_k_grid_desc, - const TensorDescriptor& in_n_hi_wi_c_grid_desc, - const ConvStrides& conv_strides, - const ConvDilations& conv_dilations, - const InLeftPads& in_left_pads, - const InRightPads& in_right_pads, - Number, - Number, - Number) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto GemmK1 = Number{}; - constexpr auto IYTilda = Number{}; - constexpr auto IXTilda = Number{}; - - const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); - const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); - const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); - - const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); - const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); - - const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); - const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); - - const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); - const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); - - const auto ConvStrideH = conv_strides[I0]; - const auto ConvStrideW = conv_strides[I1]; - - const auto ConvDilationH = conv_dilations[I0]; - const auto ConvDilationW = conv_dilations[I1]; - - const auto InLeftPadH = in_left_pads[I0]; - const auto InLeftPadW = in_left_pads[I1]; - - const auto InRightPadH = in_right_pads[I0]; - const auto InRightPadW = in_right_pads[I1]; - - const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); - const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); - - const auto YTilda = ConvStrideH / GcdStrideDilationH; - const auto XTilda = ConvStrideW / GcdStrideDilationW; - - const auto YDot = math::integer_divide_ceil(Y, YTilda); - const auto XDot = math::integer_divide_ceil(X, XTilda); - - const auto HTilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); - const auto WTilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); - - // only work on HTilda and WTilda that contribute to non-padding area of input tensor - const auto IHTildaSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadH - ConvDilationH * (YTilda - I1)), ConvStrideH); - const auto IWTildaSliceBegin = math::integer_divide_floor( - math::max(I0, InLeftPadW - ConvDilationW * (XTilda - I1)), ConvStrideW); - - const auto IHTildaSliceEnd = - math::min(HTilda, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); - const auto IWTildaSliceEnd = - math::min(WTilda, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); - - const auto HTildaSlice = IHTildaSliceEnd - IHTildaSliceBegin; - const auto WTildaSlice = IWTildaSliceEnd - IWTildaSliceBegin; - - // GemmK is different for each GEMM - const auto YDotSlice = math::integer_divide_ceil(Y - IYTilda, YTilda); - const auto XDotSlice = math::integer_divide_ceil(X - IXTilda, XTilda); - - const auto K1 = GemmK1; - const auto K0 = K / K1; - - // weight tensor - const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = transform_tensor_descriptor( - wei_k_y_x_c_grid_desc, - make_tuple(make_pass_through_transform(K), - make_embed_transform(make_tuple(YDot, YTilda), - make_tuple(ConvStrideH / GcdStrideDilationH, I1)), - make_embed_transform(make_tuple(XDot, XTilda), - make_tuple(ConvStrideW / GcdStrideDilationW, I1)), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); - - const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = - transform_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(K0, K1)), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_freeze_transform(IYTilda), - make_freeze_transform(IXTilda), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<3>{}, - Sequence<2>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<0, 1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<>{}, - Sequence<>{}, - Sequence<4>{})); - -#if 1 - const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( - wei_k0_k1_ydotslice_xdotslice_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), - make_pass_through_transform(C), - make_pass_through_transform(K1)), - make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); -#else - const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( - wei_k0_k1_ydotslice_xdotslice_c_grid_desc, - make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), - make_pass_through_transform(C), - make_pass_through_transform(K1)), - make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); -#endif - - // output tensor - // this add padding check - const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( - out_n_ho_wo_k_grid_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Ho, I0, I0), - make_pad_transform(Wo, I0, I0), - make_pass_through_transform(K)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_tensor_descriptor( - out_n_hop_wop_k_grid_desc, - make_tuple(make_pass_through_transform(N), - make_embed_transform(make_tuple(YDot, HTilda), - make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), - make_embed_transform(make_tuple(XDot, WTilda), - make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), - make_pass_through_transform(K)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); - - const auto out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc = - transform_tensor_descriptor( - out_n_ydot_htilda_xdot_wtilda_k_grid_desc, - make_tuple(make_pass_through_transform(N), - make_slice_transform(YDot, I0, YDotSlice), - make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice), - make_slice_transform(XDot, I0, XDotSlice), - make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice), - make_unmerge_transform(make_tuple(K0, K1))), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{}, - Sequence<5, 6>{})); - -#if 1 - const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( - out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc, - make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), - make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), - make_pass_through_transform(K1)), - make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); -#else - const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( - out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc, - make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), - make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)), - make_pass_through_transform(K1)), - make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); -#endif - - // input tensor - const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( - in_n_hi_wi_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Hi, InLeftPadH, InRightPadH), - make_pad_transform(Wi, InLeftPadW, InRightPadW), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_tensor_descriptor( - in_n_hip_wip_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_embed_transform(make_tuple(YTilda, HTilda), - make_tuple(ConvDilationH, ConvStrideH)), - make_embed_transform(make_tuple(XTilda, WTilda), - make_tuple(ConvDilationW, ConvStrideW)), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); - - const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_tensor_descriptor( - in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc, - make_tuple(make_pass_through_transform(N), - make_freeze_transform(IYTilda), - make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice), - make_freeze_transform(IXTilda), - make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<0>{}, - Sequence<>{}, - Sequence<1>{}, - Sequence<>{}, - Sequence<2>{}, - Sequence<3>{})); - - const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( - in_n_htildaslice_wtildaslice_c_grid_desc, - make_tuple(make_pass_through_transform(C), - make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice))), - make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc, - out_gemmk0_gemmn_gemmk1_grid_desc, - in_gemmm_gemmn_grid_desc); -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp deleted file mode 100644 index 26ca0bf1115dc1e9767b3528c39f9af320fee0f2..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp +++ /dev/null @@ -1,410 +0,0 @@ -#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP -#define CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP - -#include "common_header.hpp" -#include "tensor_adaptor.hpp" -#include "threadwise_tensor_slice_transfer_v2.hpp" -#include "threadwise_contraction_dlops.hpp" - -namespace ck { - -// C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1] -// A and B are visable to the whole block, C is distributed among each thread -// Assume: -// 1. A: -// 1. ABlockDesc_BK0_BM_BK1 is known at compile-time -// 2. ABlockBuffer is DynamicBuffer -// 2. B: -// 1. BBlockDesc_BK0_BN_BK1 is known at compile-time -// 2. BBlockBuffer is DynamicBuffer -// 3. C: -// 1. CThreadDesc_BM0_BM11_BN0_BN11 is known at compile-time -// 2. CThreadBuffer is StaticBuffer -// Also assume: -// BM10BN10ThreadClusterBM10Xs::Size() = BM10BN10ThreadClusterBN10Xs::Size() == 2 -// BM0 = BN0 = 2. It will do 2x2 pipelined read and fma (ABBA optimization) -template - typename BM10BN10ThreadClusterBN10Xs, // Sequence - index_t AThreadCopyScalarPerVector_BM11, - index_t BThreadCopyScalarPerVector_BN11, - typename enable_if::type = false> -struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2 -{ - using AIndex = MultiIndex<3>; - using BIndex = MultiIndex<3>; - using CIndex = MultiIndex<4>; - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - - static constexpr index_t BK0 = ABlockDesc_BK0_BM_BK1{}.GetLength(I0); - static constexpr index_t BK1 = ABlockDesc_BK0_BM_BK1{}.GetLength(I2); - static constexpr index_t BM = ABlockDesc_BK0_BM_BK1{}.GetLength(I1); - static constexpr index_t BN = BBlockDesc_BK0_BN_BK1{}.GetLength(I1); - - static constexpr index_t BM100 = BM10BN10ThreadClusterBM10Xs{}[I0]; - static constexpr index_t BN100 = BM10BN10ThreadClusterBN10Xs{}[I0]; - - static constexpr index_t BM101 = BM10BN10ThreadClusterBM10Xs{}[I1]; - static constexpr index_t BN101 = BM10BN10ThreadClusterBN10Xs{}[I1]; - - static constexpr index_t BM11 = BM1PerThreadBM11; - static constexpr index_t BN11 = BN1PerThreadBN11; - - static constexpr index_t BM1 = BM100 * BM101 * BM11; - static constexpr index_t BN1 = BN100 * BN101 * BN11; - - static constexpr index_t BM0 = BM / BM1; - static constexpr index_t BN0 = BN / BN1; - - __host__ __device__ static constexpr auto - MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1& a_block_desc_bk0_bm_bk1) - { - const auto a_block_bk0_bm0_bm1_bk1 = transform_tensor_descriptor( - a_block_desc_bk0_bm_bk1, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform(make_tuple(Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); - - return a_block_bk0_bm0_bm1_bk1; - } - - __host__ __device__ static constexpr auto - MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1& b_block_desc_bk0_bn_bk1) - { - const auto b_block_desc_bk0_bn0_bn1_bk1 = transform_tensor_descriptor( - b_block_desc_bk0_bn_bk1, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform(make_tuple(Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); - - return b_block_desc_bk0_bn0_bn1_bk1; - } - - __host__ __device__ static constexpr auto - MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN() - { - // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] - // lower: [BM, BN] - constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n = - make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple( - Number{}, Number{}, Number{}, Number{})), - make_unmerge_transform(make_tuple( - Number{}, Number{}, Number{}, Number{}))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{})); - - return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n; - } - - __host__ __device__ static constexpr auto - MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1() - { - // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] - // lower: [BM0, BM1, BN0, BN1] - constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1 = - make_single_stage_tensor_adaptor( - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{}))), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{})); - - return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1; - } - - __host__ __device__ static constexpr auto GetCThreadTensorLengths_BM0_BM1_BN0_BN1() - { - return Sequence{}; - } - - static constexpr auto a_block_desc_bk0_bm0_bm1_bk1_ = - MakeABlockDescriptor_BK0_BM0_BM1_BK1(ABlockDesc_BK0_BM_BK1{}); - - static constexpr auto b_block_desc_bk0_bn0_bn1_bk1_ = - MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{}); - - public: - __device__ BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2() - : c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( - get_thread_local_1d_id())}, - a_thread_copy_{ - make_tuple(0, c_thread_origin_data_idx_[I0], c_thread_origin_data_idx_[I1], 0)}, - b_thread_copy_{ - make_tuple(0, c_thread_origin_data_idx_[I2], c_thread_origin_data_idx_[I3], 0)} - { - static_assert(ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() && - BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), - "wrong! Desc should be known at compile-time"); - - static_assert(BlockSize == BM101 * BM100 * BN101 * BN100, - "wrong! blocksize and cluster size not consistent"); - - static_assert(BM % BM1 == 0 && BN % BN1 == 0, "wrong!"); - - static_assert(ABlockDesc_BK0_BM_BK1{}.GetLength(I0) == - BBlockDesc_BK0_BN_BK1{}.GetLength(I0), - "wrong! K dimension not consistent"); - - // TODO remove this restriction - static_assert(BM10BN10ThreadClusterBM10Xs::Size() == 2 && - BM10BN10ThreadClusterBN10Xs::Size() == 2, - "wrong!"); - - // TODO: remove this restriction - static_assert(BM0 == 2 && BN0 == 2, "wrong"); - } - - __device__ static CIndex CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(index_t thread_id) - { - // lower: [BM0, BM1, BN0, BN1] - // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] - constexpr auto adaptor0 = - MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1(); - - // lower: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] - // upper: [Tid, BM0, BM11, BN0, BN11] - constexpr auto adaptor1 = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(BM100, BN100, BM101, BN101)), - make_pass_through_transform(BM0), - make_pass_through_transform(BM11), - make_pass_through_transform(BN0), - make_pass_through_transform(BN11)), - make_tuple( - Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); - - constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1); - - return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0)); - } - - template - __device__ void Run(const CThreadDesc_BM0_BM11_BN0_BN11&, - const ABlockBuffer& a_block_buf, - const BBlockBuffer& b_block_buf, - CThreadBuffer& c_thread_buf) const - { - static_assert(CThreadDesc_BM0_BM11_BN0_BN11::IsKnownAtCompileTime(), - "wrong! Desc should be known at compile-time"); - - // TODO: remove this restriction - static_assert(BM0 == 2 && BN0 == 2 && - CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I0) == BM0 && - CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I2) == BN0, - "wrong"); - - auto a_thread_buf = make_static_buffer( - a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize()); - auto b_thread_buf = make_static_buffer( - b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize()); - - constexpr auto threadwise_contraction = - ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1< - FloatA, - FloatB, - FloatC, - decltype(a_thread_desc_bk0_bm0_bm1_bk1_), - decltype(b_thread_desc_bk0_bn0_bn1_bk1_), - CThreadDesc_BM0_BM11_BN0_BN11, - Sequence, - Sequence<1, BM1PerThreadBM11>, - Sequence<1, BN1PerThreadBN11>>{}; - - // read A_sub_0 - a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, - make_tuple(I0, I0, I0, I0), - a_block_buf, - a_thread_desc_bk0_bm0_bm1_bk1_, - make_tuple(I0, I0, I0, I0), - a_thread_buf); - - // read B_sub_0 - b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, - make_tuple(I0, I0, I0, I0), - b_block_buf, - b_thread_desc_bk0_bn0_bn1_bk1_, - make_tuple(I0, I0, I0, I0), - b_thread_buf); - - // read B_sub_1 - b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, - make_tuple(I0, I1, I0, I0), - b_block_buf, - b_thread_desc_bk0_bn0_bn1_bk1_, - make_tuple(I0, I1, I0, I0), - b_thread_buf); - - // read A_sub_1 - a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, - make_tuple(I0, I1, I0, I0), - a_block_buf, - a_thread_desc_bk0_bm0_bm1_bk1_, - make_tuple(I0, I1, I0, I0), - a_thread_buf); - - // C_sub_00 += transpose(A_sub_0) * B_sub_0 - threadwise_contraction.Run(a_thread_buf, - make_tuple(I0, I0, I0, I0), - b_thread_buf, - make_tuple(I0, I0, I0, I0), - c_thread_buf, - make_tuple(I0, I0, I0, I0)); - - // C_sub_01 += transpose(A_sub_0) * B_sub_1 - threadwise_contraction.Run(a_thread_buf, - make_tuple(I0, I0, I0, I0), - b_thread_buf, - make_tuple(I0, I1, I0, I0), - c_thread_buf, - make_tuple(I0, I0, I1, I0)); - - // loop over rest of bk0 - static_for{}([&](auto bk0) { - // read A_sub_0 - a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, - make_tuple(bk0, I0, I0, I0), - a_block_buf, - a_thread_desc_bk0_bm0_bm1_bk1_, - make_tuple(I0, I0, I0, I0), - a_thread_buf); - - // C_sub_10 += transpose(A_sub_1) * B_sub_0 - threadwise_contraction.Run(a_thread_buf, - make_tuple(I0, I1, I0, I0), - b_thread_buf, - make_tuple(I0, I0, I0, I0), - c_thread_buf, - make_tuple(I1, I0, I0, I0)); - - // read B_sub_0 - b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, - make_tuple(bk0, I0, I0, I0), - b_block_buf, - b_thread_desc_bk0_bn0_bn1_bk1_, - make_tuple(I0, I0, I0, I0), - b_thread_buf); - - // C_sub_11 += transpose(A_sub_1) * B_sub_1 - threadwise_contraction.Run(a_thread_buf, - make_tuple(I0, I1, I0, I0), - b_thread_buf, - make_tuple(I0, I1, I0, I0), - c_thread_buf, - make_tuple(I1, I0, I1, I0)); - - // read B_sub_1 - b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, - make_tuple(bk0, I1, I0, I0), - b_block_buf, - b_thread_desc_bk0_bn0_bn1_bk1_, - make_tuple(I0, I1, I0, I0), - b_thread_buf); - - // read A_sub_1 - a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, - make_tuple(bk0, I1, I0, I0), - a_block_buf, - a_thread_desc_bk0_bm0_bm1_bk1_, - make_tuple(I0, I1, I0, I0), - a_thread_buf); - - // C_sub_00 += transpose(A_sub_0) * B_sub_0 - threadwise_contraction.Run(a_thread_buf, - make_tuple(I0, I0, I0, I0), - b_thread_buf, - make_tuple(I0, I0, I0, I0), - c_thread_buf, - make_tuple(I0, I0, I0, I0)); - - // C_sub_01 += transpose(A_sub_0) * B_sub_1 - threadwise_contraction.Run(a_thread_buf, - make_tuple(I0, I0, I0, I0), - b_thread_buf, - make_tuple(I0, I1, I0, I0), - c_thread_buf, - make_tuple(I0, I0, I1, I0)); - }); - - // C_sub_10 += transpose(A_sub_1) * B_sub_0 - threadwise_contraction.Run(a_thread_buf, - make_tuple(I0, I1, I0, I0), - b_thread_buf, - make_tuple(I0, I0, I0, I0), - c_thread_buf, - make_tuple(I1, I0, I0, I0)); - - // C_sub_11 += transpose(A_sub_1) * B_sub_1 - threadwise_contraction.Run(a_thread_buf, - make_tuple(I0, I1, I0, I0), - b_thread_buf, - make_tuple(I0, I1, I0, I0), - c_thread_buf, - make_tuple(I1, I0, I1, I0)); - } - - private: - // A[BK0, BM0, BM1, BK1] - static constexpr auto a_thread_desc_bk0_bm0_bm1_bk1_ = - make_naive_tensor_descriptor_packed(make_tuple( - Number{}, Number{}, Number{}, Number{})); - - // B[BK0, BN0, BN1, BK1] - static constexpr auto b_thread_desc_bk0_bn0_bn1_bk1_ = - make_naive_tensor_descriptor_packed(make_tuple( - Number{}, Number{}, Number{}, Number{})); - - using AThreadCopy = ThreadwiseTensorSliceTransfer_v4r1< - FloatA, - FloatA, - decltype(a_block_desc_bk0_bm0_bm1_bk1_), - decltype(a_thread_desc_bk0_bm0_bm1_bk1_), - Sequence, // SliceLengths - Sequence<0, 1, 2, 3>, // DimAccessOrder - Sequence<1, 1, BM1PerThreadBM11, BK1>, // SrcVectorTensorLengths - Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder - - using BThreadCopy = ThreadwiseTensorSliceTransfer_v4r1< - FloatB, - FloatB, - decltype(b_block_desc_bk0_bn0_bn1_bk1_), - decltype(b_thread_desc_bk0_bn0_bn1_bk1_), - Sequence, // SliceLengths - Sequence<0, 1, 2, 3>, // DimAccessOrder - Sequence<1, 1, BN1PerThreadBN11, BK1>, // SrcVectorTensorLengths - Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder - - CIndex c_thread_origin_data_idx_; - - AThreadCopy a_thread_copy_; - BThreadCopy b_thread_copy_; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp deleted file mode 100644 index 5cc2f2393ee86543b9b535382824265db17beb93..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp +++ /dev/null @@ -1,183 +0,0 @@ -#ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP -#define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP - -#include "common_header.hpp" -#include "threadwise_gemm_dlops_v3.hpp" - -namespace ck { - -template -struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 -{ - struct MatrixIndex - { - index_t k; - index_t h; - index_t w; - }; - - // HACK: fix this @Jing Zhang - static constexpr index_t KPerThreadSubC = 4; - - static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - - static constexpr auto b_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple( - Number{}, Number<1>{}, Number{}, Number{})); - - static constexpr auto c_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple( - Number{}, Number<1>{}, Number{}, Number{})); - - using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1>, - 1, - ThreadGemmADataPerRead_K, - 1>; - - __device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3() - : c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())}, - a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread)} - { - static_assert(BlockMatrixA::IsKnownAtCompileTime() && - BlockMatrixB::IsKnownAtCompileTime() && - ThreadMatrixC::IsKnownAtCompileTime(), - "wrong! Desc should be known at compile-time"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0), - "wrong! K dimension not consistent\n"); - - constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed - constexpr index_t H = BlockMatrixB{}.GetLength(I2); - constexpr index_t W = BlockMatrixB{}.GetLength(I3); - - static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0, - "wrong! Cannot evenly divide work among\n"); - - constexpr auto KThreadCluster = K / KPerThread; - constexpr auto HThreadCluster = H / HPerThread; - constexpr auto WThreadCluster = W / WPerThread; - - static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster, - "wrong! wrong blocksize\n"); - } - - __device__ static constexpr auto GetThreadMatrixCLengths() - { - return Sequence{}; - } - - __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) - { - constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{}); - constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{}); - - constexpr auto num_w_threads = W / WPerThread; - constexpr auto num_h_threads = H / HPerThread; - constexpr auto num_hw_threads = num_w_threads * num_h_threads; - - index_t k_thread_id = thread_id / num_hw_threads; - index_t hw_thread_id = thread_id % num_hw_threads; - - index_t h_thread_id = hw_thread_id / num_w_threads; - index_t w_thread_id = hw_thread_id % num_w_threads; - - return MatrixIndex{k_thread_id, h_thread_id, w_thread_id}; - } - - template - __device__ void Run(const ABlockBuffer& a_block_buf, - const BThreadBuffer& b_thread_buf, - CThreadBuffer& c_thread_buf) const - { - static_assert( - is_same, remove_cvref_t>::value && - is_same, remove_cvref_t>::value && - is_same, remove_cvref_t>::value && - "wrong! inconsistent type"); - - constexpr auto I0 = Number<0>{}; - - constexpr auto a_block_mtx = BlockMatrixA{}; - - constexpr auto EPerBlock = a_block_mtx.GetLength(I0); - - // HACK: fix this @Jing Zhang - constexpr auto HoPerThreadSubC = 2; - constexpr auto WoPerThreadSubC = 2; - - static_assert(KPerThread % KPerThreadSubC == 0, ""); - static_assert(HPerThread % HoPerThreadSubC == 0, ""); - static_assert(WPerThread % WoPerThreadSubC == 0, ""); - - // thread A buffer for GEMM - StaticBuffer - a_thread_buf; - - constexpr auto threadwise_gemm = ThreadwiseGemmDlops_km_kn_mn_v3{}; - - static_for<0, EPerBlock, EPerThreadLoop>{}([&](auto e_begin) { - static_for<0, KPerThread, KPerThreadSubC>{}([&](auto k_begin) { - a_thread_copy_.Run(a_block_mtx, - make_tuple(e_begin, k_begin), - a_block_buf, - a_thread_mtx_, - make_tuple(I0, I0), - a_thread_buf); - - static_for<0, HPerThread, HoPerThreadSubC>{}([&](auto h_begin) { - static_for<0, WPerThread, WoPerThreadSubC>{}([&](auto w_begin) { - threadwise_gemm.Run(a_thread_buf, - make_tuple(I0, I0), - b_thread_buf, - make_tuple(e_begin, I0, h_begin, w_begin), - c_thread_buf, - make_tuple(k_begin, I0, h_begin, w_begin)); - }); - }); - }); - }); - } - - template - __device__ void MoveASliceWindow(const BlockMatrixA&, - const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx) - { - a_thread_copy_.MoveSrcSliceWindow(BlockMatrixA{}, a_block_slice_move_step_idx); - } - - private: - MatrixIndex c_thread_begin_mtx_idx_; - - AThreadCopy a_thread_copy_; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp deleted file mode 100644 index 36c6783204276cbd98ab228d1b8bd40ee64df07d..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp +++ /dev/null @@ -1,282 +0,0 @@ -#ifndef CK_BLOCKWISE_GEMM_XDLOPS_HPP -#define CK_BLOCKWISE_GEMM_XDLOPS_HPP - -#include "common_header.hpp" -#include "threadwise_tensor_slice_transfer.hpp" -#include "xdlops_gemm.hpp" -#include "tensor_adaptor.hpp" - -namespace ck { - -template -struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - - static constexpr index_t WaveSize = 64; - - static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); - static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); - - static constexpr index_t K0 = BK0NK1BlockDesc{}.GetLength(I0); - - static constexpr auto xdlops_gemm = XdlopsGemm{}; - - static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL); - static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL); - - StaticBufferV2, MRepeat * NRepeat, true> - c_thread_buf_; - - __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } - - __device__ static auto GetWaveIdx() - { - const index_t thread_id = get_thread_local_1d_id(); - - constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); - } - - __device__ static auto CalculateAThreadOriginDataIndex() - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_m = wave_idx[I0]; - - const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex(); - - return make_tuple(xdlops_a_idx[I0], 0, waveId_m, xdlops_a_idx[I1], 0); - } - - __device__ static auto CalculateBThreadOriginDataIndex() - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_n = wave_idx[I1]; - - const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex(); - - return make_tuple(xdlops_b_idx[I0], 0, waveId_n, xdlops_b_idx[I1], 0); - } - - template - __device__ static auto - CalculateCThreadOriginDataIndex(Number, Number, Number, Number) - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_m = wave_idx[I0]; - const auto waveId_n = wave_idx[I1]; - - const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i); - - constexpr auto mrepeat_mwave_mperxdl_to_m_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerXDL))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1, 2>{})); - - constexpr auto nrepeat_nwave_nperxdl_to_n_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerXDL))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1, 2>{})); - - const index_t c_thread_m = mrepeat_mwave_mperxdl_to_m_adaptor.CalculateBottomIndex( - make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; - const index_t c_thread_n = nrepeat_nwave_nperxdl_to_n_adaptor.CalculateBottomIndex( - make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; - - return make_tuple(c_thread_m, c_thread_n); - } - - __host__ __device__ BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1() - { - static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && - BK0NK1BlockDesc::IsKnownAtCompileTime(), - "wrong! Desc should be known at compile-time"); - - static_assert(AK0MK1BlockDesc{}.GetLength(I0) == BK0NK1BlockDesc{}.GetLength(I0), - "wrong! K0 dimension not consistent"); - - static_assert(AK0MK1BlockDesc{}.GetLength(I2) == BK0NK1BlockDesc{}.GetLength(I2), - "wrong! K1 dimension not consistent"); - - static_assert(BlockSize == MWaves * NWaves * WaveSize, - "BlockSize != MWaves * NWaves * WaveSize\n"); - - static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0, - "wrong!"); - } - - __host__ __device__ static constexpr auto GetCM0N0M1N1M2M3M4N2ThreadDescriptor() - { - constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); - - constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; - constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; - constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; - constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; - - return make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, I1, M0, M1, M2, N)); - } - - __host__ __device__ static constexpr auto GetCM0N0M1N1M2M3M4N2BlockDescriptor() - { - constexpr auto c_m0_n0_m1_n1_m2_n2_block_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, - Number{}, - Number{}, - Number{}, - Number{}, - Number{})); - - return xdlops_gemm.MakeCM0N0M1N1M2M3M4N2Descriptor(c_m0_n0_m1_n1_m2_n2_block_desc); - } - - template - __host__ __device__ static constexpr auto - MakeCM0N0M1N1M2M3M4N2GridDescriptor(const CMNGridDesc& c_m_n_grid_desc) - { - const auto c_m0_n0_m1_n1_m2_n2_grid_desc = transform_tensor_descriptor( - c_m_n_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerXDL)), - make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerXDL))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{})); - - return xdlops_gemm.MakeCM0N0M1N1M2M3M4N2Descriptor(c_m0_n0_m1_n1_m2_n2_grid_desc); - } - - __host__ __device__ static constexpr auto MakeAK0M0M1M2K1BlockDescriptor() - { - return transform_tensor_descriptor( - AK0MK1BlockDesc{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - __host__ __device__ static constexpr auto MakeBK0N0N1N2K1BlockDescriptor() - { - return transform_tensor_descriptor( - BK0NK1BlockDesc{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - static constexpr auto a_k0_m0_m1_m2_k1_block_desc = MakeAK0M0M1M2K1BlockDescriptor(); - static constexpr auto b_k0_n0_n1_n2_k1_block_desc = MakeBK0N0N1N2K1BlockDescriptor(); - - template - __device__ void Run(const ABlockBuffer& a_block_buf, - const BBlockBuffer& b_block_buf, - CThreadBuffer& c_thread_buf) const - { - auto a_thread_buf = make_static_buffer( - a_thread_desc_.GetElementSpaceSize()); - auto b_thread_buf = make_static_buffer( - b_thread_desc_.GetElementSpaceSize()); - - static_for<0, MRepeat, 1>{}([&](auto m0) { - // read A - a_thread_copy_.Run(a_k0_m0_m1_m2_k1_block_desc, - make_tuple(I0, m0, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, I0, I0, I0, I0), - a_thread_buf); - - static_for<0, NRepeat, 1>{}([&](auto n0) { - // read B - b_thread_copy_.Run(b_k0_n0_n1_n2_k1_block_desc, - make_tuple(I0, n0, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, I0, I0, I0, I0), - b_thread_buf); - - static_for<0, K0, xdlops_gemm.K0PerXdlops>{}([&](auto k0) { - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, K1, 1>{}([&](auto i) { - a_thread_vec.template AsType()(i) = a_thread_buf - [Number{}]; - b_thread_vec.template AsType()(i) = b_thread_buf - [Number{}]; - }); - - using mfma_input_type = - typename vector_type::type; - - constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0)); - - xdlops_gemm.template Run(a_thread_vec.template AsType(), - b_thread_vec.template AsType(), - c_thread_buf.GetVector(Number{})); - }); - }); - }); - } - - private: - // A[K, M] - static constexpr auto a_thread_desc_ = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, I1, I1, I1, Number{})); - - // B[K, N] - static constexpr auto b_thread_desc_ = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, I1, I1, I1, Number{})); - - static constexpr auto c_thread_desc_ = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{})); - - using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4>, - 4, - K1, - K1>; - - using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4>, - 4, - K1, - K1>; - - AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; - BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp deleted file mode 100644 index 0214b713522b8d14b7c2f1cbae20d5912c3a9534..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_HPP -#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_HPP - -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "cluster_descriptor.hpp" -#include "threadwise_tensor_slice_transfer.hpp" - -namespace ck { - -// this version does following things to avoid scratch memory issue -// 1. Use StaticallyIndexedArray instead of C array for thread buffer -// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor -// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate -template -struct BlockwiseTensorSliceTransfer_v4 -{ - static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); - - using Index = MultiIndex; - - __device__ constexpr BlockwiseTensorSliceTransfer_v4(const SrcDesc& src_desc, - const Index& src_block_slice_origin, - const DstDesc& dst_desc, - const Index& dst_block_slice_origin) - : threadwise_transfer_( - src_desc, make_zero_multi_index(), dst_desc, make_zero_multi_index()) - - { - static_assert(nDim == remove_reference_t>::GetNumOfDimension() && - nDim == remove_reference_t>::GetNumOfDimension() && - nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() && - nDim == ThreadClusterLengths::Size() && - nDim == ThreadClusterArrangeOrder::Size() && - nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(), - "wrong! nDim not consistent"); - - static_assert( - is_same{}, - "wrong! threads should be mapped to cover entire slicing window"); - - static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(), - "wrong! BlockSize too small"); - - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( - make_multi_index(get_thread_local_1d_id())); - - const auto thread_data_idx_begin = thread_cluster_idx * ThreadSliceLengths{}; - - threadwise_transfer_.SetSrcSliceOrigin(src_desc, - src_block_slice_origin + thread_data_idx_begin); - threadwise_transfer_.SetDstSliceOrigin(dst_desc, - dst_block_slice_origin + thread_data_idx_begin); - } - } - - template - __device__ void - RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.RunRead(src_desc, src_buf, src_step_hacks); - } - } - - template - __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.RunRead(src_desc, src_buf); - } - } - - template - __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.RunWrite(dst_desc, dst_buf); - } - } - - __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.MoveSrcSliceWindow(src_desc, step); - } - } - - // SrcMoveSliceWindowStepHack to control index calculation move slice window - template - __device__ void - MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& step, - const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.MoveSrcSliceWindow( - src_desc, step, src_move_slice_window_step_hack); - } - } - - __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.MoveDstSliceWindow(dst_desc, step); - } - } - - private: - static constexpr auto thread_cluster_desc_ = - make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); - - using ThreadwiseTransfer = - ThreadwiseTensorSliceTransfer_v3; - - ThreadwiseTransfer threadwise_transfer_; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp b/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp deleted file mode 100644 index 6b2d2d523198e13ead7a8cda91f09893a72b5701..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp +++ /dev/null @@ -1,156 +0,0 @@ -#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V2_HPP -#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V2_HPP - -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "cluster_descriptor.hpp" -#include "threadwise_tensor_slice_transfer_v2.hpp" - -namespace ck { - -// this version does following things to avoid scratch memory issue -// 1. Use StaticallyIndexedArray instead of C array for thread buffer -// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor -// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate -template -struct BlockwiseTensorSliceTransfer_v4r1 -{ - static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); - - using Index = MultiIndex; - - __device__ constexpr BlockwiseTensorSliceTransfer_v4r1(const SrcDesc& src_desc, - const Index& src_block_slice_origin, - const DstDesc& dst_desc, - const Index& dst_block_slice_origin) - : threadwise_transfer_( - src_desc, make_zero_multi_index(), dst_desc, make_zero_multi_index()) - - { - static_assert(nDim == remove_reference_t>::GetNumOfDimension() && - nDim == remove_reference_t>::GetNumOfDimension() && - nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() && - nDim == ThreadClusterLengths::Size() && - nDim == ThreadClusterArrangeOrder::Size() && - nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(), - "wrong! nDim not consistent"); - - static_assert( - is_same{}, - "wrong! threads should be mapped to cover entire slicing window"); - - static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(), - "wrong! BlockSize too small"); - - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( - make_multi_index(get_thread_local_1d_id())); - - const auto thread_data_idx_begin = thread_cluster_idx * ThreadSliceLengths{}; - - threadwise_transfer_.SetSrcSliceOrigin(src_desc, - src_block_slice_origin + thread_data_idx_begin); - threadwise_transfer_.SetDstSliceOrigin(dst_desc, - dst_block_slice_origin + thread_data_idx_begin); - } - } - - template - __device__ void - RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.RunRead(src_desc, src_buf, src_step_hacks); - } - } - - template - __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.RunWrite(dst_desc, dst_buf); - } - } - - __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.MoveSrcSliceWindow(src_desc, step); - } - } - - // SrcMoveSliceWindowStepHack to control index calculation move slice window - template - __device__ void - MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& step, - const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.MoveSrcSliceWindow( - src_desc, step, src_move_slice_window_step_hack); - } - } - - __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step) - { - if(BlockSize == thread_cluster_desc_.GetElementSize() or - get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) - { - threadwise_transfer_.MoveDstSliceWindow(dst_desc, step); - } - } - - private: - static constexpr auto thread_cluster_desc_ = - make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); - - using ThreadwiseTransfer = - ThreadwiseTensorSliceTransfer_v3r1; - - ThreadwiseTransfer threadwise_transfer_; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp deleted file mode 100644 index 2653dd43401012375c21660af88e36a868e90f96..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp +++ /dev/null @@ -1,650 +0,0 @@ -#ifndef CK_GRIDWISE_GEMM_V1R3_HPP -#define CK_GRIDWISE_GEMM_V1R3_HPP - -#include "common_header.hpp" -#include "multi_index_transform_helper.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "blockwise_gemm_dlops_v2r3.hpp" -#include "blockwise_tensor_slice_transfer_v2.hpp" -#include "threadwise_tensor_slice_transfer_v2.hpp" -#include "threadwise_tensor_slice_set.hpp" - -namespace ck { - -#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_gemm_dlops_v1r3( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const AK0M0M1K1GridDesc a_k0_m0_m1_k1_grid_desc, - const BK0N0N1K1GridDesc b_k0_n0_n1_k1_grid_desc, - const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc, - const CBlockIdToM0N0BlockClusterAdaptor c_blockid_to_m0_n0_block_cluster_adaptor) -{ - constexpr index_t shared_block_size = - GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - __shared__ FloatAB p_shared_block[shared_block_size]; - - GridwiseGemm::Run(p_a_grid, - p_b_grid, - p_c_grid, - p_shared_block, - a_k0_m0_m1_k1_grid_desc, - b_k0_n0_n1_k1_grid_desc, - c_m0_m10_m11_n0_n10_n11_grid_desc, - c_blockid_to_m0_n0_block_cluster_adaptor, - integral_constant{}, - integral_constant{}); -} -#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER -// pass tensor descriptor by CONSTANT void pointer -// CONSTANT is needed to inform compiler void pointers in the kernel signature are pointing to -// non-modifiable parameter address space, so compiler can enable corresponding optimization -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_gemm_dlops_v1r3(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k0_m0_m1_k1_grid_desc, - const void CONSTANT* p_b_k0_n0_n1_k1_grid_desc, - const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) -{ - // first cast void CONSTANT void* to void* - // second cast void* to Desc* - // the copy constructor of tensor descriptor doesn't take address_space(4) - const auto a_k0_m0_m1_k1_grid_desc = *reinterpret_cast( - cast_pointer_to_generic_address_space(p_a_k0_m0_m1_k1_grid_desc)); - const auto b_k0_n0_n1_k1_grid_desc = *reinterpret_cast( - cast_pointer_to_generic_address_space(p_b_k0_n0_n1_k1_grid_desc)); - const auto c_m0_m10_m11_n0_n10_n11_grid_desc = - *reinterpret_cast( - cast_pointer_to_generic_address_space(p_c_m0_m10_m11_n0_n10_n11_grid_desc)); - const auto c_blockid_to_m0_n0_block_cluster_adaptor = - *reinterpret_cast( - cast_pointer_to_generic_address_space(p_c_blockid_to_m0_n0_block_cluster_adaptor)); - - constexpr index_t shared_block_size = - GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - __shared__ FloatAB p_shared_block[shared_block_size]; - - GridwiseGemm::Run(p_a_grid, - p_b_grid, - p_c_grid, - p_shared_block, - a_k0_m0_m1_k1_grid_desc, - b_k0_n0_n1_k1_grid_desc, - c_m0_m10_m11_n0_n10_n11_grid_desc, - c_blockid_to_m0_n0_block_cluster_adaptor, - integral_constant{}, - integral_constant{}); -} -#endif - -template -struct GridwiseGemmDlops_km_kn_mn_v1r3 -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - - // K1 should be Number<...> - static constexpr auto K1 = AK0MK1GridDesc{}.GetLength(I2); - - __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() - { - // TODO: change this. I think it needs multi-dimensional alignment - constexpr auto max_lds_align = K1; - - // TODO: check alignment - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - - // TODO: check alignment - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - - // TODO: check alignment - // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_aligned_space_size = - math::integer_least_multiple(a_k_m_block_desc.GetElementSpaceSize(), max_lds_align); - - constexpr auto b_block_aligned_space_size = - math::integer_least_multiple(b_k_n_block_desc.GetElementSpaceSize(), max_lds_align); - - return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB); - } - - __host__ __device__ static constexpr bool - CheckValidity(const AK0MK1GridDesc& a_k0_m_k1_grid_desc, - const BK0NK1GridDesc& b_k0_n_k1_grid_desc, - const CMNGridDesc& c_m_n_grid_desc) - { - const auto M = a_k0_m_k1_grid_desc.GetLength(I1); - const auto N = b_k0_n_k1_grid_desc.GetLength(I1); - const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); - - // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) - - return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && - K0 == b_k0_n_k1_grid_desc.GetLength(I0) && - K1 == a_k0_m_k1_grid_desc.GetLength(I2) && - K1 == b_k0_n_k1_grid_desc.GetLength(I2)) && - (M % MPerBlockM1 == 0 && N % NPerBlockN1 == 0 && K0 % KPerBlock == 0); - } - - __host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N) - { - const index_t grid_size = (M / MPerBlockM1) * (N / NPerBlockN1); - - return grid_size; - } - - __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K0) - { - const bool has_main_k_block_loop = (K0 + KPerBlock) / (2 * KPerBlock) > 1; - - return has_main_k_block_loop; - } - - __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t K0) - { - const bool has_double_tail_k_block_loop = (K0 / KPerBlock) % 2 == 0; - - return has_double_tail_k_block_loop; - } - - __host__ __device__ static constexpr auto - MakeAK0M0M1K1GridDescriptor(const AK0MK1GridDesc& a_k0_m_k1_grid_desc) - { - const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); - const auto M = a_k0_m_k1_grid_desc.GetLength(I1); - - const auto M1 = Number{}; - const auto M0 = M / M1; - - const auto a_k0_m0_m1_k1_grid_desc = - transform_tensor_descriptor(a_k0_m_k1_grid_desc, - make_tuple(make_pass_through_transform(K0), - make_unmerge_transform(make_tuple(M0, M1)), - make_pass_through_transform(K1)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); - - return a_k0_m0_m1_k1_grid_desc; - } - - __host__ __device__ static constexpr auto - MakeBK0N0N1K1GridDescriptor(const BK0NK1GridDesc& b_k0_n_k1_grid_desc) - { - const auto K0 = b_k0_n_k1_grid_desc.GetLength(I0); - const auto N = b_k0_n_k1_grid_desc.GetLength(I1); - - const auto N1 = Number{}; - const auto N0 = N / N1; - - const auto b_k0_n0_n1_k1_grid_desc = - transform_tensor_descriptor(b_k0_n_k1_grid_desc, - make_tuple(make_pass_through_transform(K0), - make_unmerge_transform(make_tuple(N0, N1)), - make_pass_through_transform(K1)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); - - return b_k0_n0_n1_k1_grid_desc; - } - - __host__ __device__ static constexpr auto - MakeCM0M10M11N0N10N11GridDescriptor(const CMNGridDesc& c_m_n_grid_desc) - { - const auto M = c_m_n_grid_desc.GetLength(I0); - const auto N = c_m_n_grid_desc.GetLength(I1); - - constexpr auto M1 = Number{}; - constexpr auto N1 = Number{}; - - const auto M0 = M / M1; - const auto N0 = N / N1; - - constexpr auto M11 = - Number{}; - constexpr auto N11 = - Number{}; - - constexpr auto M10 = M1 / M11; - constexpr auto N10 = N1 / N11; - - const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_tensor_descriptor( - c_m_n_grid_desc, - make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)), - make_unmerge_transform(make_tuple(N0, N10, N11))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - - return c_m0_m10_m11_n0_n10_n11_grid_desc; - } - - __host__ __device__ static constexpr auto - MakeCBlockIdToM0N0BlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc) - { - const auto M = c_m_n_grid_desc.GetLength(I0); - const auto N = c_m_n_grid_desc.GetLength(I1); - - constexpr auto M1 = Number{}; - constexpr auto N1 = Number{}; - - const auto M0 = M / M1; - const auto N0 = N / N1; - - const auto c_blockid_to_m0_n0_block_cluster_adaptor = - make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(M0, N0))), - make_tuple(Sequence<0, 1>{}), - make_tuple(Sequence<0>{})); - - return c_blockid_to_m0_n0_block_cluster_adaptor; - } - - using AK0M0M1K1GridDesc = decltype(MakeAK0M0M1K1GridDescriptor(AK0MK1GridDesc{})); - using BK0N0N1K1GridDesc = decltype(MakeBK0N0N1K1GridDescriptor(BK0NK1GridDesc{})); - using CM0M10M11N0N10N11GridDesc = decltype(MakeCM0M10M11N0N10N11GridDescriptor(CMNGridDesc{})); - using CBlockIdToM0N0BlockClusterAdaptor = - decltype(MakeCBlockIdToM0N0BlockClusterAdaptor(CMNGridDesc{})); - - template - __device__ static void - Run(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - FloatAB* __restrict__ p_shared_block, - const AK0M0M1K1GridDesc& a_k0_m0_m1_k1_grid_desc, - const BK0N0N1K1GridDesc& b_k0_n0_n1_k1_grid_desc, - const CM0M10M11N0N10N11GridDesc& c_m0_m10_m11_n0_n10_n11_grid_desc, - const CBlockIdToM0N0BlockClusterAdaptor& c_blockid_to_m0_n0_block_cluster_adaptor, - integral_constant, - integral_constant) - { - const auto a_global_buf = make_dynamic_buffer( - p_a_grid, a_k0_m0_m1_k1_grid_desc.GetElementSpaceSize()); - const auto b_global_buf = make_dynamic_buffer( - p_b_grid, b_k0_n0_n1_k1_grid_desc.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize()); - - // divide block work by [M, N] - const auto c_m0_n0_block_cluster_idx = - c_blockid_to_m0_n0_block_cluster_adaptor.CalculateBottomIndex( - make_multi_index(get_block_1d_id())); - - // HACK: this force index data into SGPR - const index_t im0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I0]); - const index_t in0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I1]); - - // TODO: change this. I think it needs multi-dimensional alignment - constexpr auto max_lds_align = K1; - - // TODO: check alignment - // A matrix in LDS memory, dst of blockwise copy - // be careful of LDS alignment - constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, I1, Number{}, K1), max_lds_align); - - // TODO: check alignment - // B matrix in LDS memory, dst of blockwise copy - // be careful of LDS alignment - constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, I1, Number{}, K1), max_lds_align); - - // TODO: check alignment - // A matrix in LDS memory, for blockwise GEMM - constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - - // TODO: check alignment - // B matrix in LDS memory, for blockwise GEMM - constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - - static_assert(a_k0_m0_m1_k1_block_desc.GetElementSpaceSize() == - a_k0_m_k1_block_desc.GetElementSpaceSize() && - b_k0_n0_n1_k1_block_desc.GetElementSpaceSize() == - b_k0_n_k1_block_desc.GetElementSpaceSize() && - "wrong!"); - - // A matrix blockwise copy - auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1< - BlockSize, - InMemoryDataOperationEnum_t::Set, - Sequence, - ABlockTransferThreadSliceLengths_K0_M0_M1_K1, - ABlockTransferThreadClusterLengths_K0_M0_M1_K1, - ABlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(a_k0_m0_m1_k1_grid_desc), - decltype(a_k0_m0_m1_k1_block_desc), - ABlockTransferSrcAccessOrder, - Sequence<0, 1, 2, 3>, - ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, // SrcVectorTensorLengths - ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, // DstVectorTensorLengths - ABlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder - Sequence<0, 1, 2, 3>, // DstVectorTensorContiguousDimOrder - false, - true>(a_k0_m0_m1_k1_grid_desc, - make_multi_index(0, im0, 0, 0), - a_k0_m0_m1_k1_block_desc, - make_multi_index(0, 0, 0, 0)); - - // B matrix blockwise copy - auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v4r1< - BlockSize, - InMemoryDataOperationEnum_t::Set, - Sequence, - BBlockTransferThreadSliceLengths_K0_N0_N1_K1, - BBlockTransferThreadClusterLengths_K0_N0_N1_K1, - BBlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(b_k0_n0_n1_k1_grid_desc), - decltype(b_k0_n0_n1_k1_block_desc), - BBlockTransferSrcAccessOrder, - Sequence<0, 1, 2, 3>, - BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, // SrcVectorTensorLengths - BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, // DstVectorTensorLengths - BBlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder - Sequence<0, 1, 2, 3>, // DstVectorTensorContiguousDimOrder - false, - true>(b_k0_n0_n1_k1_grid_desc, - make_multi_index(0, in0, 0, 0), - b_k0_n0_n1_k1_block_desc, - make_multi_index(0, 0, 0, 0)); - - // GEMM definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[KPerBlock, MPerBlockM1] is in LDS - // b_mtx[KPerBlocl, NPerBlockN1] is in LDS - // c_mtx[MPerBlockM1, NPerBlockN1] is distributed among threads, and saved in - // register - const auto blockwise_gemm = - BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< - BlockSize, - FloatAB, - FloatAB, - FloatAcc, - decltype(a_k0_m_k1_block_desc), - decltype(b_k0_n_k1_block_desc), - M1PerThreadM111, - N1PerThreadN111, - KPerThread, - M11N11ThreadClusterM110Xs, - M11N11ThreadClusterN110Xs, - M1PerThreadM111, - N1PerThreadN111>{}; - - constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths = - decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1(); - - constexpr auto c_m10_m11_n10_n11_thread_desc = make_naive_tensor_descriptor_packed( - sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); - - // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_aligned_space_size = math::integer_least_multiple( - a_k0_m0_m1_k1_block_desc.GetElementSpaceSize(), max_lds_align); - - constexpr auto b_block_aligned_space_size = math::integer_least_multiple( - b_k0_n0_n1_k1_block_desc.GetElementSpaceSize(), max_lds_align); - - FloatAB* p_a_block_double = p_shared_block; - FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; - - // register allocation for output - auto c_thread_buf = make_static_buffer( - c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize()); - - ThreadwiseTensorSliceSet_v1{} - .Run(c_m10_m11_n10_n11_thread_desc, - make_tuple(I0, I0, I0, I0), - c_thread_buf, - FloatAcc{0}); - - constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0, 0); - constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0, 0); - - auto a_block_even_buf = make_dynamic_buffer( - p_a_block_double, a_k0_m0_m1_k1_block_desc.GetElementSpaceSize()); - auto b_block_even_buf = make_dynamic_buffer( - p_b_block_double, b_k0_n0_n1_k1_block_desc.GetElementSpaceSize()); - - auto a_block_odd_buf = make_dynamic_buffer( - p_a_block_double + a_block_aligned_space_size, - a_k0_m0_m1_k1_block_desc.GetElementSpaceSize()); - auto b_block_odd_buf = make_dynamic_buffer( - p_b_block_double + b_block_aligned_space_size, - b_k0_n0_n1_k1_block_desc.GetElementSpaceSize()); - - // LDS double buffer: preload data into LDS - { - a_blockwise_copy.RunRead(a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridStepHacks{}); - b_blockwise_copy.RunRead(b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridStepHacks{}); - - a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_even_buf); - b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_even_buf); - } - - if constexpr(HasMainKBlockLoop) - { - const auto K0 = a_k0_m0_m1_k1_grid_desc.GetLength(I0); - - index_t k_block_data_begin = 0; - - // LDS double buffer: main body - // use Do-While loop instead of For loop to simplify control flow - do - { - // even iteration - a_blockwise_copy.MoveSrcSliceWindow(a_k0_m0_m1_k1_grid_desc, - a_block_slice_copy_step, - AGridMoveSliceWindowStepHacks{}); - b_blockwise_copy.MoveSrcSliceWindow(b_k0_n0_n1_k1_grid_desc, - b_block_slice_copy_step, - BGridMoveSliceWindowStepHacks{}); - - __syncthreads(); - - // LDS doubel buffer: load next data from device mem - a_blockwise_copy.RunRead(a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridStepHacks{}); - b_blockwise_copy.RunRead(b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridStepHacks{}); - - // LDS double buffer: GEMM on current data - blockwise_gemm.Run(c_m10_m11_n10_n11_thread_desc, - a_block_even_buf, - b_block_even_buf, - c_thread_buf); - - // LDS double buffer: store next data to LDS - a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_odd_buf); - b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_odd_buf); - - // odd iteration - a_blockwise_copy.MoveSrcSliceWindow(a_k0_m0_m1_k1_grid_desc, - a_block_slice_copy_step, - AGridMoveSliceWindowStepHacks{}); - b_blockwise_copy.MoveSrcSliceWindow(b_k0_n0_n1_k1_grid_desc, - b_block_slice_copy_step, - BGridMoveSliceWindowStepHacks{}); - - __syncthreads(); - - // LDS doubel buffer: load next data from device mem - a_blockwise_copy.RunRead(a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridStepHacks{}); - b_blockwise_copy.RunRead(b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridStepHacks{}); - - // LDS double buffer: GEMM on current data - blockwise_gemm.Run( - c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf); - - // LDS double buffer: store next data to LDS - a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_even_buf); - b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_even_buf); - - k_block_data_begin += 2 * KPerBlock; - } while(k_block_data_begin < K0 - 2 * KPerBlock); - } - - // LDS double buffer: tail - if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left - { - a_blockwise_copy.MoveSrcSliceWindow( - a_k0_m0_m1_k1_grid_desc, a_block_slice_copy_step, AGridMoveSliceWindowStepHacks{}); - b_blockwise_copy.MoveSrcSliceWindow( - b_k0_n0_n1_k1_grid_desc, b_block_slice_copy_step, BGridMoveSliceWindowStepHacks{}); - - __syncthreads(); - - // LDS double buffer: load last data from device mem - a_blockwise_copy.RunRead(a_k0_m0_m1_k1_grid_desc, a_global_buf, AGridStepHacks{}); - b_blockwise_copy.RunRead(b_k0_n0_n1_k1_grid_desc, b_global_buf, BGridStepHacks{}); - - // LDS double buffer: GEMM on 2nd-last data - blockwise_gemm.Run( - c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf); - - // LDS double buffer: store last data to LDS - a_blockwise_copy.RunWrite(a_k0_m0_m1_k1_block_desc, a_block_odd_buf); - b_blockwise_copy.RunWrite(b_k0_n0_n1_k1_block_desc, b_block_odd_buf); - - __syncthreads(); - - // LDS double buffer: GEMM on last data - blockwise_gemm.Run( - c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf); - } - else // if has 1 iteration left - { - __syncthreads(); - - // LDS double buffer: GEMM on last data - blockwise_gemm.Run( - c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf); - } - - // output: register to global memory - { - constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc = - make_naive_tensor_descriptor_packed( - make_tuple(I1, - Number{}, - Number{}, - I1, - Number{}, - Number{})); - - const auto c_m10_m11_n10_n11_thread_origin_idx_on_block = - blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( - get_thread_local_1d_id()); - - ThreadwiseTensorSliceTransfer_v1r3< - FloatAcc, - FloatC, - decltype(c_m0_m10_m11_n0_n10_n11_thread_desc), - decltype(c_m0_m10_m11_n0_n10_n11_grid_desc), - Sequence<1, - c_m10_m11_n10_n11_thread_tensor_lengths[I0], - c_m10_m11_n10_n11_thread_tensor_lengths[I1], - 1, - c_m10_m11_n10_n11_thread_tensor_lengths[I2], - c_m10_m11_n10_n11_thread_tensor_lengths[I3]>, - CThreadTransferSrcDstAccessOrder, - CThreadTransferSrcDstVectorDim, - CThreadTransferDstScalarPerVector, - CGlobalMemoryDataOperation, - 1, - true>{c_m0_m10_m11_n0_n10_n11_grid_desc, - make_multi_index(im0, - c_m10_m11_n10_n11_thread_origin_idx_on_block[I0], - c_m10_m11_n10_n11_thread_origin_idx_on_block[I1], - in0, - c_m10_m11_n10_n11_thread_origin_idx_on_block[I2], - c_m10_m11_n10_n11_thread_origin_idx_on_block[I3])} - .Run(c_m0_m10_m11_n0_n10_n11_thread_desc, - make_tuple(I0, I0, I0, I0, I0, I0), - c_thread_buf, - c_m0_m10_m11_n0_n10_n11_grid_desc, - c_grid_buf, - CGridStepHacks{}); - } - } -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp deleted file mode 100644 index 86e047c965a2743fae00d35378e2ad029f01d263..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp +++ /dev/null @@ -1,639 +0,0 @@ -#ifndef CK_GRIDWISE_GEMM_XDLOPS_V2R3_HPP -#define CK_GRIDWISE_GEMM_XDLOPS_V2R3_HPP - -#include "common_header.hpp" -#include "multi_index_transform_helper.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "blockwise_gemm_xdlops.hpp" -#include "blockwise_tensor_slice_transfer.hpp" -#include "threadwise_tensor_slice_transfer.hpp" -#include "threadwise_tensor_slice_set.hpp" - -namespace ck { - -#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const AK0MK1GridDesc a_k0_m_k1_grid_desc, - const BK0NK1GridDesc b_k0_n_k1_grid_desc, - const CM0N0M1N1M2M3M4N2GridDesc c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - const CBlockClusterAdaptor c_block_cluster_adaptor) -{ - constexpr index_t shared_block_size = - GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - __shared__ FloatAB p_shared_block[shared_block_size]; - - GridwiseGemm::template Run(p_a_grid, - p_b_grid, - p_c_grid, - p_shared_block, - a_k0_m_k1_grid_desc, - b_k0_n_k1_grid_desc, - c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - c_block_cluster_adaptor); -} -#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k0_m_k1_grid_desc, - const void CONSTANT* p_b_k0_n_k1_grid_desc, - const void CONSTANT* p_c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - const void CONSTANT* p_c_block_cluster_adaptor) -{ - constexpr index_t shared_block_size = - GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - const auto a_k0_m_k1_grid_desc = *reinterpret_cast( - cast_pointer_to_generic_address_space(p_a_k0_m_k1_grid_desc)); - const auto b_k0_n_k1_grid_desc = *reinterpret_cast( - cast_pointer_to_generic_address_space(p_b_k0_n_k1_grid_desc)); - const auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc = - *reinterpret_cast( - cast_pointer_to_generic_address_space(p_c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc)); - const auto c_block_cluster_adaptor = *reinterpret_cast( - cast_pointer_to_generic_address_space(p_c_block_cluster_adaptor)); - - __shared__ FloatAB p_shared_block[shared_block_size]; - - GridwiseGemm::template Run(p_a_grid, - p_b_grid, - p_c_grid, - p_shared_block, - a_k0_m_k1_grid_desc, - b_k0_n_k1_grid_desc, - c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - c_block_cluster_adaptor); -} -#endif - -template -struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr auto I4 = Number<4>{}; - static constexpr auto I5 = Number<5>{}; - static constexpr auto I6 = Number<6>{}; - static constexpr auto I7 = Number<7>{}; - - // K1 should be Number<...> - static constexpr auto K1 = Number{}; - - __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() - { - constexpr auto max_lds_align = K1; - - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_k0_m_k1_block_desc = [&]() { - if constexpr(ABlockLdsExtraM) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_k0_n_k1_block_desc = [&]() { - if constexpr(BBlockLdsExtraN) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_space_size = - math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); - - constexpr auto b_block_space_size = - math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align); - - return (a_block_space_size + b_block_space_size) * sizeof(FloatAB); - } - - // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} - __host__ __device__ static constexpr bool - CheckValidity(const AK0MK1GridDesc& a_k0_m_k1_grid_desc, - const BK0NK1GridDesc& b_k0_n_k1_grid_desc, - const CMNGridDesc& c_m_n_grid_desc, - index_t M01, - index_t N01) - { - static_assert(is_known_at_compile_time>::value, - "wrong! K1 need to be known at compile-time"); - - static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) && - (NPerBlock % (NRepeat * NPerXDL)) == 0, - "Invalid tuning param!"); - - const auto M = a_k0_m_k1_grid_desc.GetLength(I1); - const auto N = b_k0_n_k1_grid_desc.GetLength(I1); - const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); - - if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && - K0 == b_k0_n_k1_grid_desc.GetLength(I0) && K1 == a_k0_m_k1_grid_desc.GetLength(I2) && - K1 == b_k0_n_k1_grid_desc.GetLength(I2))) - return false; - - if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) - return false; - - // check M01, N01 - constexpr auto M1 = Number{}; - constexpr auto N1 = Number{}; - - const auto M0 = M / M1; - const auto N0 = N / N1; - - if(!(M0 % M01 == 0 && N0 % N01 == 0)) - return false; - - // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) - return true; - } - - __host__ __device__ static constexpr index_t - CalculateGridSize(const CMNGridDesc& c_m_n_grid_desc) - { - const auto M = c_m_n_grid_desc.GetLength(I0); - const auto N = c_m_n_grid_desc.GetLength(I1); - - const index_t grid_size = (M / MPerBlock) * (N / NPerBlock); - - return grid_size; - } - - __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0) - { - const bool has_main_k0_block_loop = (K0 / K0PerBlock) > 1; - - return has_main_k0_block_loop; - } - - __host__ __device__ static constexpr auto - MakeCM0N0M1N1M2M3M4N2GridDescriptor(const CMNGridDesc& c_m_n_grid_desc) - { - constexpr auto max_lds_align = K1; - - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_k0_m_k1_block_desc = [&]() { - if constexpr(ABlockLdsExtraM) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_k0_n_k1_block_desc = [&]() { - if constexpr(BBlockLdsExtraN) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - using BlockwiseGemm = - BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1; - - return BlockwiseGemm::MakeCM0N0M1N1M2M3M4N2GridDescriptor(c_m_n_grid_desc); - } - - // return block_id to C matrix tile idx (m0, n0) mapping - __host__ __device__ static constexpr auto - MakeCBlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc, index_t M01, index_t N01) - { - const auto M = c_m_n_grid_desc.GetLength(I0); - const auto N = c_m_n_grid_desc.GetLength(I1); - - constexpr auto M1 = Number{}; - constexpr auto N1 = Number{}; - - const auto M0 = M / M1; - const auto N0 = N / N1; - - const auto M00 = M0 / M01; - const auto N00 = N0 / N01; - - const auto m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(M00, M01)), - make_unmerge_transform(make_tuple(N00, N01))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{})); - - const auto c_blockid_to_m00_m01_n00_n01_block_cluster_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(M00, N00, M01, N01))), - make_tuple(Sequence<0, 1, 2, 3>{}), - make_tuple(Sequence<0>{})); - - const auto c_blockid_to_m0_n0_block_cluster_adaptor = - chain_tensor_adaptors(m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor, - c_blockid_to_m00_m01_n00_n01_block_cluster_adaptor); - - return c_blockid_to_m0_n0_block_cluster_adaptor; - } - - using CM0N0M1N1M2M3M4N2GridDesc = decltype(MakeCM0N0M1N1M2M3M4N2GridDescriptor(CMNGridDesc{})); - using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1)); - - template - __device__ static void Run(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - FloatAB* __restrict__ p_shared_block, - const AK0MK1GridDesc& a_k0_m_k1_grid_desc, - const BK0NK1GridDesc& b_k0_n_k1_grid_desc, - const CM0N0M1N1M2M3M4N2GridDesc& c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - const CBlockClusterAdaptor& c_block_cluster_adaptor) - { - const auto a_grid_buf = make_dynamic_buffer( - p_a_grid, a_k0_m_k1_grid_desc.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( - p_b_grid, b_k0_n_k1_grid_desc.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc.GetElementSpaceSize()); - - const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0); - - // divide block work by [M, N] - const auto block_work_idx = - c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id())); - - // HACK: this force m/n_block_data_idx_on_grid into SGPR - const index_t m_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); - - const index_t n_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); - - // lds max alignment - constexpr auto max_lds_align = K1; - - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_k0_m_k1_block_desc = [&]() { - if constexpr(ABlockLdsExtraM) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_k0_n_k1_block_desc = [&]() { - if constexpr(BBlockLdsExtraN) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - // A matrix blockwise copy - auto a_blockwise_copy = - BlockwiseTensorSliceTransfer_v4, - ABlockTransferThreadSliceLengths_K0_M_K1, - ABlockTransferThreadClusterLengths_K0_M_K1, - ABlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(a_k0_m_k1_grid_desc), - decltype(a_k0_m_k1_block_desc), - ABlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - ABlockTransferSrcVectorDim, - 2, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_K1, - 1, - 1, - AThreadTransferSrcResetCoordinateAfterRun, - true>(a_k0_m_k1_grid_desc, - make_multi_index(0, m_block_data_idx_on_grid, 0), - a_k0_m_k1_block_desc, - make_multi_index(0, 0, 0)); - - // B matrix blockwise copy - auto b_blockwise_copy = - BlockwiseTensorSliceTransfer_v4, - BBlockTransferThreadSliceLengths_K0_N_K1, - BBlockTransferThreadClusterLengths_K0_N_K1, - BBlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(b_k0_n_k1_grid_desc), - decltype(b_k0_n_k1_block_desc), - BBlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - BBlockTransferSrcVectorDim, - 2, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_K1, - 1, - 1, - BThreadTransferSrcResetCoordinateAfterRun, - true>(b_k0_n_k1_grid_desc, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b_k0_n_k1_block_desc, - make_multi_index(0, 0, 0)); - - // GEMM definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[K0PerBlock, MPerBlock] is in LDS - // b_mtx[K0PerBlock, NPerBlock] is in LDS - // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in - // register - // sanity check - - auto blockwise_gemm = - BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; - - auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); - - // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_space_size = - math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); - - FloatAB* p_a_block = p_shared_block; - FloatAB* p_b_block = p_shared_block + a_block_space_size; - - constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); - constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); - - // hack to control index calculation when iterating over A and B matrix for threadwise copy - constexpr auto a_k0_m_k1_grid_step_hacks = AGridStepHacks{}; - constexpr auto b_k0_n_k1_grid_step_hacks = BGridStepHacks{}; - - // hack to control index calculation when move slice window for A and B matrix for - // threadwise copy - constexpr auto a_k0_m_k1_grid_move_slice_window_step_hack = AGridMoveSliceWindowStepHacks{}; - constexpr auto b_k0_n_k1_grid_move_slice_window_step_hack = BGridMoveSliceWindowStepHacks{}; - - auto a_block_buf = make_dynamic_buffer( - p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer( - p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize()); - - // preload data into LDS - { - a_blockwise_copy.RunRead(a_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_step_hacks); - b_blockwise_copy.RunRead(b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_step_hacks); - - a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf); - b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf); - } - - // main body - index_t k0_block_data_begin = 0; - - if constexpr(HasMainKBlockLoop) - { - do - { - a_blockwise_copy.MoveSrcSliceWindow(a_k0_m_k1_grid_desc, - a_block_slice_copy_step, - a_k0_m_k1_grid_move_slice_window_step_hack); - b_blockwise_copy.MoveSrcSliceWindow(b_k0_n_k1_grid_desc, - b_block_slice_copy_step, - b_k0_n_k1_grid_move_slice_window_step_hack); - - a_blockwise_copy.RunRead( - a_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_step_hacks); - - block_sync_lds(); - - b_blockwise_copy.RunRead( - b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_step_hacks); - - blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); - - block_sync_lds(); - - a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf); - b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf); - - k0_block_data_begin += K0PerBlock; - } while(k0_block_data_begin < (K0 - K0PerBlock)); - } - - // tail - { - block_sync_lds(); - - blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); - } - - // output: register to global memory - { - constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc = - blockwise_gemm.GetCM0N0M1N1M2M3M4N2BlockDescriptor(); - - constexpr auto M0 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I0); - constexpr auto N0 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I1); - constexpr auto M1 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I2); - constexpr auto N1 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I3); - constexpr auto M2 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I4); - constexpr auto M3 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I5); - constexpr auto M4 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I6); - constexpr auto N2 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I7); - - constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc = - make_naive_tensor_descriptor_packed(make_tuple( - Number{}, Number{}, I1, I1, Number{}, I1, Number{}, I1)); - - // calculate origin of thread output tensor on global memory - // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = - blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); - - const index_t m_thread_data_on_grid = - m_block_data_idx_on_grid + c_thread_mtx_on_block[I0]; - - const index_t n_thread_data_on_grid = - n_block_data_idx_on_grid + c_thread_mtx_on_block[I1]; - - constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks = CGridStepHacks{}; - - const auto m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), - make_tuple(Sequence<0, 1, 2, 3, 4>{}), - make_tuple(Sequence<0>{})); - - const auto m_thread_data_on_grid_idx = - m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( - make_multi_index(m_thread_data_on_grid)); - - const auto n_thread_data_on_grid_to_n0_n1_n2_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - const auto n_thread_data_on_grid_idx = - n_thread_data_on_grid_to_n0_n1_n2_adaptor.CalculateBottomIndex( - make_multi_index(n_thread_data_on_grid)); - - auto c_thread_copy = - ThreadwiseTensorSliceTransfer_v1r3, - CThreadTransferSrcDstAccessOrder, - CThreadTransferSrcDstVectorDim, - CThreadTransferDstScalarPerVector, - CGlobalMemoryDataOperation, - 1, - true>{ - - c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - make_multi_index(m_thread_data_on_grid_idx[I0], - n_thread_data_on_grid_idx[I0], - m_thread_data_on_grid_idx[I1], - n_thread_data_on_grid_idx[I1], - m_thread_data_on_grid_idx[I2], - m_thread_data_on_grid_idx[I3], - m_thread_data_on_grid_idx[I4], - n_thread_data_on_grid_idx[I2])}; - - c_thread_copy.Run(c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc, - make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), - c_thread_buf, - c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - c_grid_buf, - c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks); - } - } -}; // namespace ck - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r4.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r4.hpp deleted file mode 100644 index f27fc73b3b57f36a3aaf7f1247dd759854741949..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r4.hpp +++ /dev/null @@ -1,680 +0,0 @@ -#ifndef CK_GRIDWISE_GEMM_XDLOPS_V2R4_HPP -#define CK_GRIDWISE_GEMM_XDLOPS_V2R4_HPP - -#include "common_header.hpp" -#include "multi_index_transform_helper.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "blockwise_gemm_xdlops.hpp" -#include "blockwise_tensor_slice_transfer.hpp" -#include "threadwise_tensor_slice_transfer.hpp" -#include "threadwise_tensor_slice_set.hpp" - -namespace ck { - -#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_gemm_xdlops_v2r4(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const ABK0MK1GridDesc a_b_k0_m_k1_grid_desc, - const BBK0NK1GridDesc b_b_k0_n_k1_grid_desc, - const CM0N0M1N1M2M3M4N2GridDesc c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - const CBlockClusterAdaptor c_block_cluster_adaptor) -{ - constexpr index_t shared_block_size = - GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - __shared__ FloatAB p_shared_block[shared_block_size]; - - GridwiseGemm::template Run(p_a_grid, - p_b_grid, - p_c_grid, - p_shared_block, - a_b_k0_m_k1_grid_desc, - b_b_k0_n_k1_grid_desc, - c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - c_block_cluster_adaptor); -} -#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_gemm_xdlops_v2r4(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_b_k0_m_k1_grid_desc, - const void CONSTANT* p_b_b_k0_n_k1_grid_desc, - const void CONSTANT* p_c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - const void CONSTANT* p_c_block_cluster_adaptor) -{ - constexpr index_t shared_block_size = - GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - const auto a_b_k0_m_k1_grid_desc = *reinterpret_cast( - cast_pointer_to_generic_address_space(p_a_b_k0_m_k1_grid_desc)); - const auto b_b_k0_n_k1_grid_desc = *reinterpret_cast( - cast_pointer_to_generic_address_space(p_b_b_k0_n_k1_grid_desc)); - const auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc = - *reinterpret_cast( - cast_pointer_to_generic_address_space(p_c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc)); - const auto c_block_cluster_adaptor = *reinterpret_cast( - cast_pointer_to_generic_address_space(p_c_block_cluster_adaptor)); - - __shared__ FloatAB p_shared_block[shared_block_size]; - - GridwiseGemm::template Run(p_a_grid, - p_b_grid, - p_c_grid, - p_shared_block, - a_b_k0_m_k1_grid_desc, - b_b_k0_n_k1_grid_desc, - c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - c_block_cluster_adaptor); -} -#endif - -template -struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4 -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr auto I4 = Number<4>{}; - static constexpr auto I5 = Number<5>{}; - static constexpr auto I6 = Number<6>{}; - static constexpr auto I7 = Number<7>{}; - - // K1 should be Number<...> - static constexpr auto K1 = Number{}; - - __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() - { - constexpr auto max_lds_align = K1; - - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_k0_m_k1_block_desc = [&]() { - if constexpr(ABlockLdsExtraM) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_k0_n_k1_block_desc = [&]() { - if constexpr(BBlockLdsExtraN) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_space_size = - math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); - - constexpr auto b_block_space_size = - math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align); - - return (a_block_space_size + b_block_space_size) * sizeof(FloatAB); - } - - // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} - __host__ __device__ static constexpr bool - CheckValidity(const ABK0MK1GridDesc& a_b_k0_m_k1_grid_desc, - const BBK0NK1GridDesc& b_b_k0_n_k1_grid_desc, - const CMNGridDesc& c_m_n_grid_desc, - index_t M01, - index_t N01) - { - static_assert(is_known_at_compile_time>::value, - "wrong! K1 need to be known at compile-time"); - - static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) && - (NPerBlock % (NRepeat * NPerXDL)) == 0, - "Invalid tuning param!"); - - const auto M = a_b_k0_m_k1_grid_desc.GetLength(I2); - const auto N = b_b_k0_n_k1_grid_desc.GetLength(I2); - const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1); - const auto KBatch = a_b_k0_m_k1_grid_desc.GetLength(I0); - - if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && - K0 == b_b_k0_n_k1_grid_desc.GetLength(I1) && - K1 == a_b_k0_m_k1_grid_desc.GetLength(I3) && - K1 == b_b_k0_n_k1_grid_desc.GetLength(I3) && - KBatch == b_b_k0_n_k1_grid_desc.GetLength(I0))) - return false; - - if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) - return false; - - // check M01, N01 - constexpr auto M1 = Number{}; - constexpr auto N1 = Number{}; - - const auto M0 = M / M1; - const auto N0 = N / N1; - - if(!(M0 % M01 == 0 && N0 % N01 == 0)) - return false; - - // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) - return true; - } - - __host__ __device__ static constexpr index_t - CalculateGridSize(const CMNGridDesc& c_m_n_grid_desc, index_t KBatch) - { - const auto M = c_m_n_grid_desc.GetLength(I0); - const auto N = c_m_n_grid_desc.GetLength(I1); - - const index_t grid_size = (M / MPerBlock) * (N / NPerBlock) * KBatch; - - return grid_size; - } - - __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0) - { - const bool has_main_k0_block_loop = K0 > K0PerBlock; - - return has_main_k0_block_loop; - } - - __host__ __device__ static constexpr auto - MakeCM0N0M1N1M2M3M4N2GridDescriptor(const CMNGridDesc& c_m_n_grid_desc) - { - constexpr auto max_lds_align = K1; - - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_k0_m_k1_block_desc = [&]() { - if constexpr(ABlockLdsExtraM) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_k0_n_k1_block_desc = [&]() { - if constexpr(BBlockLdsExtraN) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - using BlockwiseGemm = - BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1; - - return BlockwiseGemm::MakeCM0N0M1N1M2M3M4N2GridDescriptor(c_m_n_grid_desc); - } - - // return block_id to C matrix tile idx (m0, n0) mapping - __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor( - const CMNGridDesc& c_m_n_grid_desc, index_t M01, index_t N01, index_t KBatch) - { - const auto M = c_m_n_grid_desc.GetLength(I0); - const auto N = c_m_n_grid_desc.GetLength(I1); - - constexpr auto M1 = Number{}; - constexpr auto N1 = Number{}; - - const auto M0 = M / M1; - const auto N0 = N / N1; - - const auto M00 = M0 / M01; - const auto N00 = N0 / N01; - - const auto kbatch_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_pass_through_transform(KBatch), - make_unmerge_transform(make_tuple(M00, M01)), - make_unmerge_transform(make_tuple(N00, N01))), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2, 4>{})); - - const auto c_blockid_to_kbatch_m00_m01_n00_n01_block_cluster_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(KBatch, M00, N00, M01, N01))), - make_tuple(Sequence<0, 1, 2, 3, 4>{}), - make_tuple(Sequence<0>{})); - - const auto c_blockid_to_kbatch_m0_n0_block_cluster_adaptor = - chain_tensor_adaptors(kbatch_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor, - c_blockid_to_kbatch_m00_m01_n00_n01_block_cluster_adaptor); - - return c_blockid_to_kbatch_m0_n0_block_cluster_adaptor; - } - - using CM0N0M1N1M2M3M4N2GridDesc = decltype(MakeCM0N0M1N1M2M3M4N2GridDescriptor(CMNGridDesc{})); - using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1)); - - template - __device__ static void Run(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - FloatAB* __restrict__ p_shared_block, - const ABK0MK1GridDesc& a_b_k0_m_k1_grid_desc, - const BBK0NK1GridDesc& b_b_k0_n_k1_grid_desc, - const CM0N0M1N1M2M3M4N2GridDesc& c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - const CBlockClusterAdaptor& c_block_cluster_adaptor) - { - const auto a_grid_buf = make_dynamic_buffer( - p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( - p_b_grid, b_b_k0_n_k1_grid_desc.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc.GetElementSpaceSize()); - - const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1); - - // divide block work by [M, N] - const auto block_work_idx = - c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id())); - - const index_t k_batch_id = block_work_idx[I0]; - // HACK: this force m/n_block_data_idx_on_grid into SGPR - const index_t m_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock); - - const index_t n_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock); - - // lds max alignment - constexpr auto max_lds_align = K1; - - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_k0_m_k1_block_desc = [&]() { - if constexpr(ABlockLdsExtraM) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - constexpr auto a_b_k0_m_k1_block_desc = [&]() { - if constexpr(ABlockLdsExtraM) - { - return make_naive_tensor_descriptor( - make_tuple(Number<1>{}, Number{}, Number{}, K1), - make_tuple(Number{} * Number{} * K1, - Number{} * K1, - K1, - I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number<1>{}, Number{}, Number{}, K1), - max_lds_align); - } - }(); - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_k0_n_k1_block_desc = [&]() { - if constexpr(BBlockLdsExtraN) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - constexpr auto b_b_k0_n_k1_block_desc = [&]() { - if constexpr(BBlockLdsExtraN) - { - return make_naive_tensor_descriptor( - make_tuple(Number<1>{}, Number{}, Number{}, K1), - make_tuple(Number{} * Number{} * K1, - Number{} * K1, - K1, - I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number<1>{}, Number{}, Number{}, K1), - max_lds_align); - } - }(); - // A matrix blockwise copy - auto a_blockwise_copy = - BlockwiseTensorSliceTransfer_v4, - ABlockTransferThreadSliceLengths_K0_M_K1, - ABlockTransferThreadClusterLengths_K0_M_K1, - ABlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(a_b_k0_m_k1_grid_desc), - decltype(a_b_k0_m_k1_block_desc), - ABlockTransferSrcAccessOrder, - Sequence<0, 2, 1, 3>, - ABlockTransferSrcVectorDim, - 3, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_K1, - 1, - 1, - AThreadTransferSrcResetCoordinateAfterRun, - true>( - a_b_k0_m_k1_grid_desc, - make_multi_index(k_batch_id, 0, m_block_data_idx_on_grid, 0), - a_b_k0_m_k1_block_desc, - make_multi_index(0, 0, 0, 0)); - - // B matrix blockwise copy - auto b_blockwise_copy = - BlockwiseTensorSliceTransfer_v4, - BBlockTransferThreadSliceLengths_K0_N_K1, - BBlockTransferThreadClusterLengths_K0_N_K1, - BBlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(b_b_k0_n_k1_grid_desc), - decltype(b_b_k0_n_k1_block_desc), - BBlockTransferSrcAccessOrder, - Sequence<0, 2, 1, 3>, - BBlockTransferSrcVectorDim, - 3, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_K1, - 1, - 1, - BThreadTransferSrcResetCoordinateAfterRun, - true>( - b_b_k0_n_k1_grid_desc, - make_multi_index(k_batch_id, 0, n_block_data_idx_on_grid, 0), - b_b_k0_n_k1_block_desc, - make_multi_index(0, 0, 0, 0)); - - // GEMM definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[K0PerBlock, MPerBlock] is in LDS - // b_mtx[K0PerBlock, NPerBlock] is in LDS - // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in - // register - // sanity check - - auto blockwise_gemm = - BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; - - auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); - - // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_space_size = - math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); - - FloatAB* p_a_block = p_shared_block; - FloatAB* p_b_block = p_shared_block + a_block_space_size; - - constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); - constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); - - // hack to control index calculation when iterating over A and B matrix for threadwise copy - constexpr auto a_k0_m_k1_grid_step_hacks = AGridStepHacks{}; - constexpr auto b_k0_n_k1_grid_step_hacks = BGridStepHacks{}; - - // hack to control index calculation when move slice window for A and B matrix for - // threadwise copy - constexpr auto a_k0_m_k1_grid_move_slice_window_step_hack = AGridMoveSliceWindowStepHacks{}; - constexpr auto b_k0_n_k1_grid_move_slice_window_step_hack = BGridMoveSliceWindowStepHacks{}; - - auto a_block_buf = make_dynamic_buffer( - p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer( - p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize()); - - // preload data into LDS - { - a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_step_hacks); - b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_step_hacks); - - a_blockwise_copy.RunWrite(a_b_k0_m_k1_block_desc, a_block_buf); - b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf); - } - - // main body - index_t k_block_data_begin = 0; - if constexpr(HasMainKBlockLoop) - { - do - { - a_blockwise_copy.MoveSrcSliceWindow(a_b_k0_m_k1_grid_desc, - a_block_slice_copy_step, - a_k0_m_k1_grid_move_slice_window_step_hack); - b_blockwise_copy.MoveSrcSliceWindow(b_b_k0_n_k1_grid_desc, - b_block_slice_copy_step, - b_k0_n_k1_grid_move_slice_window_step_hack); - - a_blockwise_copy.RunRead( - a_b_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_step_hacks); - - block_sync_lds(); - - b_blockwise_copy.RunRead( - b_b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_step_hacks); - - blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); - - block_sync_lds(); - - a_blockwise_copy.RunWrite(a_b_k0_m_k1_block_desc, a_block_buf); - b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf); - - k_block_data_begin += K0PerBlock; - } while(k_block_data_begin < (K0 - K0PerBlock)); - } - - // tail - { - block_sync_lds(); - - blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); - } - - // output: register to global memory - { - constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc = - blockwise_gemm.GetCM0N0M1N1M2M3M4N2BlockDescriptor(); - - constexpr auto M0 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I0); - constexpr auto N0 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I1); - constexpr auto M1 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I2); - constexpr auto N1 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I3); - constexpr auto M2 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I4); - constexpr auto M3 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I5); - constexpr auto M4 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I6); - constexpr auto N2 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I7); - - constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc = - make_naive_tensor_descriptor_packed(make_tuple( - Number{}, Number{}, I1, I1, Number{}, I1, Number{}, I1)); - - // calculate origin of thread output tensor on global memory - // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = - blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); - - const index_t m_thread_data_on_grid = - m_block_data_idx_on_grid + c_thread_mtx_on_block[I0]; - - const index_t n_thread_data_on_grid = - n_block_data_idx_on_grid + c_thread_mtx_on_block[I1]; - - constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks = CGridStepHacks{}; - - const auto m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), - make_tuple(Sequence<0, 1, 2, 3, 4>{}), - make_tuple(Sequence<0>{})); - - const auto m_thread_data_on_grid_idx = - m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( - make_multi_index(m_thread_data_on_grid)); - - const auto n_thread_data_on_grid_to_n0_n1_n2_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - const auto n_thread_data_on_grid_idx = - n_thread_data_on_grid_to_n0_n1_n2_adaptor.CalculateBottomIndex( - make_multi_index(n_thread_data_on_grid)); - - auto c_thread_copy = - ThreadwiseTensorSliceTransfer_v1r3, - CThreadTransferSrcDstAccessOrder, - CThreadTransferSrcDstVectorDim, - CThreadTransferDstScalarPerVector, - CGlobalMemoryDataOperation, - 1, - true>{ - - c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - make_multi_index(m_thread_data_on_grid_idx[I0], - n_thread_data_on_grid_idx[I0], - m_thread_data_on_grid_idx[I1], - n_thread_data_on_grid_idx[I1], - m_thread_data_on_grid_idx[I2], - m_thread_data_on_grid_idx[I3], - m_thread_data_on_grid_idx[I4], - n_thread_data_on_grid_idx[I2])}; - - c_thread_copy.Run(c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc, - make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), - c_thread_buf, - c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, - c_grid_buf, - c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks); - } - } -}; // namespace ck - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp deleted file mode 100644 index c635da57f4d9f39e0d100650c54583662fa27977..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp +++ /dev/null @@ -1,625 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP -#define CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP - -#include "data_type.hpp" -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_blockwise.hpp" - -#include "blockwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -struct GridwiseReduction_xy_to_x_blockwise -{ - using opReduce = typename reduce_binary_operator::opType; - using preUnaryOpType = - typename reduce_unary_operator::preUnaryOp; - using posUnaryOpType = - typename reduce_unary_operator::posUnaryOp; - - static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - using blockwise_reduce = - BlockwiseReduction_2d_block_buffer; - - static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize(); - - static constexpr auto I0 = Number<0>{}; - - template - __device__ static void Run(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global); - - template <> - __device__ static void Run<1>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - (void)indices_global; - - // LDS - __shared__ compType p_in_block_buffer[BlockBufferSize]; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); - auto dst_global_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - - auto in_block_buf = - make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); - StaticBuffer accuValue_buf; - - accuValue_buf(I0) = zeroVal; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - const posUnaryOpType posUnaryOp(divider); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - constexpr auto in_block_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number{})); - - using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; - using ThreadClusterLengths = Sequence<1, BlockSize>; - - auto blockwise_src_load = - BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - srcDataType, - compType, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>(src2dDesc, - make_multi_index(block_global_1d_id, 0), - in_block_desc, - make_multi_index(0, 0)); - - constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); - - const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize; - - for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; - reducedBlocks += GredAccessesPerThreadInBlock) - { - blockwise_src_load.RunRead(src2dDesc, src_global_buf); - blockwise_src_load.RunWrite(in_block_desc, in_block_buf); - - __syncthreads(); - - // do element-wise pre-reduction operation - blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf); - - index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) - ? GredAccessesPerThreadInBlock - : toReduceBlocks - reducedBlocks; - blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0)); - - blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - } - - accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]); - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the block stores the reduced result to the global location - // representing the block - if(thread_local_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert{}(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - false>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run( - dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - threadwise_dst_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf); - } - }; - - template <> - __device__ static void Run<2>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - - // LDS - __shared__ compType p_in_block_buffer[BlockBufferSize]; - __shared__ int block_indices_buffer[BlockBufferSize]; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - auto in_block_val_buf = - make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); - auto in_block_idx_buf = - make_dynamic_buffer(block_indices_buffer, BlockBufferSize); - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - constexpr auto in_block_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number{})); - - using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; - using ThreadClusterLengths = Sequence<1, BlockSize>; - - auto blockwise_src_load = - BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - srcDataType, - compType, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>(src2dDesc, - make_multi_index(block_global_1d_id, 0), - in_block_desc, - make_multi_index(0, 0)); - - constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); - - const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize; - - int indexOffset = 0; - - for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; - reducedBlocks += GredAccessesPerThreadInBlock) - { - // load block data from global to LDS, no use of double buffers (to be improved) - blockwise_src_load.RunRead(src2dDesc, src_global_buf); - blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf); - - __syncthreads(); - - // construct the indices for the current toReduce blocks - blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset); - - // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually - // done here - blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf); - - index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) - ? GredAccessesPerThreadInBlock - : toReduceBlocks - reducedBlocks; - - blockwise_reduce::Reduce2(in_block_val_buf, - in_block_idx_buf, - BlocksInOneOp, - accuValue_buf(I0), - accuIndex_buf(I0)); - - indexOffset += BlockBufferSize; - - blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the block stores the reduced result to the global location - // representing the block - if(thread_local_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert{}(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - false>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run(dst1dDesc, - dst_global_val_buf, - ReducedDataDesc, - make_tuple(I0), - priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - } - }; - - template <> - __device__ static void Run<3>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ ws_values_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)origReduceLen; - - // LDS - __shared__ compType p_in_block_buffer[BlockBufferSize]; - __shared__ int block_indices_buffer[BlockBufferSize]; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_val_buf = - make_dynamic_buffer(ws_values_global, - src2dDesc.GetElementSpaceSize(), - type_convert{}(zeroVal)); - const auto src_global_idx_buf = make_dynamic_buffer( - ws_indices_global, src2dDesc.GetElementSpaceSize()); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - auto in_block_val_buf = - make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); - auto in_block_idx_buf = - make_dynamic_buffer(block_indices_buffer, BlockBufferSize); - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_1d_id = get_block_1d_id(); - - constexpr auto in_block_desc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number{})); - - using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; - using ThreadClusterLengths = Sequence<1, BlockSize>; - - auto blockwise_src_val_load = - BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - srcDataType, - compType, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>(src2dDesc, - make_multi_index(block_global_1d_id, 0), - in_block_desc, - make_multi_index(0, 0)); - - auto blockwise_src_idx_load = - BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - int, - int, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>(src2dDesc, - make_multi_index(block_global_1d_id, 0), - in_block_desc, - make_multi_index(0, 0)); - - constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); - - const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize; - - for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; - reducedBlocks += GredAccessesPerThreadInBlock) - { - // load block data from global to LDS, no use of double buffers (to be improved) - blockwise_src_val_load.RunRead(src2dDesc, src_global_val_buf); - blockwise_src_idx_load.RunRead(src2dDesc, src_global_idx_buf); - blockwise_src_val_load.RunWrite(in_block_desc, in_block_val_buf); - blockwise_src_idx_load.RunWrite(in_block_desc, in_block_idx_buf); - - __syncthreads(); - - index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) - ? GredAccessesPerThreadInBlock - : toReduceBlocks - reducedBlocks; - - blockwise_reduce::Reduce2(in_block_val_buf, - in_block_idx_buf, - BlocksInOneOp, - accuValue_buf(I0), - accuIndex_buf(I0)); - - blockwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - blockwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the block stores the reduced result to the global location - // representing the block - if(thread_local_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert{}(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run(dst1dDesc, - dst_global_val_buf, - ReducedDataDesc, - make_tuple(I0), - priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(block_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp deleted file mode 100644 index adfeacc037492419a3ba1574f75b851aac7cc03c..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp +++ /dev/null @@ -1,503 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP -#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP - -#include "data_type.hpp" -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_threadwise.hpp" - -#include "threadwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -struct GridwiseReduction_xy_to_x_direct_threadwise -{ - using opReduce = typename reduce_binary_operator::opType; - using preUnaryOpType = - typename reduce_unary_operator::preUnaryOp; - using posUnaryOpType = - typename reduce_unary_operator::posUnaryOp; - - static constexpr auto I0 = Number<0>{}; - - template - __device__ static void Run(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global); - - template <> - __device__ static void Run<1>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - (void)indices_global; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); - auto dst_global_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_buf; - - using threadwise_reduce = ThreadReduce; - - StaticBuffer accuValue_buf; - - accuValue_buf(I0) = zeroVal; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - const posUnaryOpType posUnaryOp(divider); - - using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, make_multi_index(thread_global_1d_id, 0)); - - constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength); - - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += GredThreadBufferLength) - { - threadwise_src_load.Run( - src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); - - // do element-wise pre-reduction operation - threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); - - // do the reduction on the Thread Buffer - threadwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0)); - - threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]); - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert{}(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>( - dst1dDesc, make_multi_index(thread_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run( - dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(thread_global_1d_id)); - - threadwise_dst_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf); - }; - - template <> - __device__ static void Run<2>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_buf; - - using threadwise_reduce = ThreadReduce; - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - - using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, make_multi_index(thread_global_1d_id, 0)); - - constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength); - - index_t indexStart = 0; - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += GredThreadBufferLength) - { - threadwise_src_load.Run( - src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); - - // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually - // done here - threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); - - // do the reduction on the Thread Buffer - threadwise_reduce::Reduce2( - in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexStart); - - indexStart += GredThreadBufferLength; - - threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert{}(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - false>( - dst1dDesc, make_multi_index(thread_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run( - dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(thread_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(thread_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - }; - - template <> - __device__ static void Run<3>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ ws_values_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)origReduceLen; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_val_buf = - make_dynamic_buffer(ws_values_global, - src2dDesc.GetElementSpaceSize(), - type_convert{}(zeroVal)); - const auto src_global_idx_buf = make_dynamic_buffer( - ws_indices_global, src2dDesc.GetElementSpaceSize()); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_val_buf; - StaticBuffer in_thread_idx_buf; - - using threadwise_reduce = ThreadReduceWithIndicesInput; - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - - using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - - auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, make_multi_index(thread_global_1d_id, 0)); - - auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, make_multi_index(thread_global_1d_id, 0)); - - constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength); - - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += GredThreadBufferLength) - { - threadwise_src_val_load.Run(src2dDesc, - src_global_val_buf, - ThreadBufferDesc, - make_tuple(I0, I0), - in_thread_val_buf); - threadwise_src_idx_load.Run(src2dDesc, - src_global_idx_buf, - ThreadBufferDesc, - make_tuple(I0, I0), - in_thread_idx_buf); - - // do the reduction on the Thread Buffer - threadwise_reduce::Reduce( - in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0)); - - threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert{}(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - false>( - dst1dDesc, make_multi_index(thread_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run( - dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(thread_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - false>(dst1dDesc, - make_multi_index(thread_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - }; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp deleted file mode 100644 index 4136dae75ffad891805c8865dfe5e145c9d743e3..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp +++ /dev/null @@ -1,544 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP -#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP - -#include "data_type.hpp" -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_warpwise.hpp" - -#include "threadwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -struct GridwiseReduction_xy_to_x_direct_warpwise -{ - using opReduce = typename reduce_binary_operator::opType; - using preUnaryOpType = - typename reduce_unary_operator::preUnaryOp; - using posUnaryOpType = - typename reduce_unary_operator::posUnaryOp; - - static constexpr auto I0 = Number<0>{}; - - template - __device__ static void Run(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global); - - template <> - __device__ static void Run<1>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - (void)indices_global; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); - auto dst_global_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_buf; - - using warpwise_reduce = - WarpReduce; - - StaticBuffer accuValue_buf; - - accuValue_buf(I0) = zeroVal; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - const posUnaryOpType posUnaryOp(divider); - - using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - index_t warp_global_1d_id = thread_global_1d_id / warpSize; - index_t thread_inwarp_id = thread_global_1d_id % warpSize; - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, - make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); - - constexpr auto in_thread_copy_step = - make_multi_index(0, warpSize * GredAccessesPerThreadInWarp); - - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += warpSize * GredAccessesPerThreadInWarp) - { - threadwise_src_load.Run( - src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); - - // do element-wise pre-reduction operation - warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); - - // do the warp-wise reduction on data of all thread buffers - warpwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0)); - - threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]); - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the warp stores the reduced result to the global location - // representing the Warp - if(thread_inwarp_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert{}(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run( - dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf(I0) * beta; - } - - auto threadwise_dst_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - threadwise_dst_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf); - } - }; - - template <> - __device__ static void Run<2>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)ws_indices_global; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_buf; - - using warpwise_reduce = - WarpReduce; - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - - using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - index_t warp_global_1d_id = thread_global_1d_id / warpSize; - index_t thread_inwarp_id = thread_global_1d_id % warpSize; - - auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, - make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); - - constexpr auto in_thread_copy_step = - make_multi_index(0, warpSize * GredAccessesPerThreadInWarp); - - index_t indexOffset = 0; - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += warpSize * GredAccessesPerThreadInWarp) - { - threadwise_src_load.Run( - src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf); - - // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually - // done here - warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf); - - // do the warp-wise reduction on data of all thread buffers - warpwise_reduce::Reduce2( - in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexOffset); - - indexOffset += warpSize * GredAccessesPerThreadInWarp; - - threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the warp stores the reduced result to the global location - // representing the Warp - if(thread_inwarp_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert{}(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run(dst1dDesc, - dst_global_val_buf, - ReducedDataDesc, - make_tuple(I0), - priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - } - }; - - template <> - __device__ static void Run<3>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - srcDataType alpha, - const srcDataType* const __restrict__ ws_values_global, - dstDataType beta, - dstDataType* const __restrict__ p_dst_global, - const int* const __restrict__ ws_indices_global, - int* const __restrict__ indices_global) - { - (void)origReduceLen; - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - const auto src_global_val_buf = - make_dynamic_buffer(ws_values_global, - src2dDesc.GetElementSpaceSize(), - type_convert{}(zeroVal)); - const auto src_global_idx_buf = make_dynamic_buffer( - ws_indices_global, src2dDesc.GetElementSpaceSize()); - auto dst_global_val_buf = make_dynamic_buffer( - p_dst_global, dst1dDesc.GetElementSpaceSize()); - auto dst_global_idx_buf = make_dynamic_buffer( - indices_global, dst1dDesc.GetElementSpaceSize()); - - StaticBuffer - in_thread_val_buf; - StaticBuffer - in_thread_idx_buf; - - using warpwise_reduce = WarpReduceWithIndicesInput; - - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - - using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>; - constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); - index_t warp_global_1d_id = thread_global_1d_id / warpSize; - index_t thread_inwarp_id = thread_global_1d_id % warpSize; - - auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, - make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); - - auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2, - 1, - 1, - 1, - false>( - src2dDesc, - make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp)); - - constexpr auto in_thread_copy_step = - make_multi_index(0, warpSize * GredAccessesPerThreadInWarp); - - for(index_t reducedLength = 0; reducedLength < toReduceLength; - reducedLength += warpSize * GredAccessesPerThreadInWarp) - { - threadwise_src_val_load.Run(src2dDesc, - src_global_val_buf, - ThreadBufferDesc, - make_tuple(I0, I0), - in_thread_val_buf); - threadwise_src_idx_load.Run(src2dDesc, - src_global_idx_buf, - ThreadBufferDesc, - make_tuple(I0, I0), - in_thread_idx_buf); - - // do the warp-wise reduction on data of all thread buffers - warpwise_reduce::Reduce( - in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0)); - - threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - // The first thread in the warp stores the reduced result to the global location - // representing the Warp - if(thread_inwarp_id == 0) - { - if(!float_equal_one{}(alpha)) - accuValue_buf(I0) *= type_convert{}(alpha); - - StaticBuffer dstValue_buf; - - dstValue_buf(I0) = type_convert{}(accuValue_buf[I0]); - - if(!float_equal_zero{}(beta)) - { - auto threadwise_dst_load = - ThreadwiseTensorSliceTransfer_v2, - Sequence<0>, - 0, - 1, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - StaticBuffer priorDstValue_buf; - - threadwise_dst_load.Run(dst1dDesc, - dst_global_val_buf, - ReducedDataDesc, - make_tuple(I0), - priorDstValue_buf); - - dstValue_buf(I0) += priorDstValue_buf[I0] * beta; - } - - auto threadwise_dst_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - auto threadwise_dst_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(dst1dDesc, - make_multi_index(warp_global_1d_id)); - - threadwise_dst_val_store.Run( - ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf); - threadwise_dst_idx_store.Run( - ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp b/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp deleted file mode 100644 index feee2b594a365d8eecebbb56134d813aaf6c3814..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp +++ /dev/null @@ -1,376 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP -#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_blockwise.hpp" - -#include "blockwise_tensor_slice_transfer.hpp" - -namespace ck { - -template -struct GridwiseReduction_xy_to_x_multiblock -{ - using opReduce = typename reduce_binary_operator::opType; - using preUnaryOpType = typename reduce_unary_operator::preUnaryOp; - using posUnaryOpType = typename reduce_unary_operator::posUnaryOp; - - static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{})); - using blockwise_reduce = - BlockwiseReduction_2d_block_buffer; - - static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize(); - - static constexpr auto I0 = Number<0>{}; - - template - __device__ static void Run(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - int BlkGroupSize, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - srcDataType* const __restrict__ ws_values_global, - int* const __restrict__ ws_indices_global); - - template <> - __device__ static void Run<1>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - int BlkGroupSize, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - srcDataType* const __restrict__ ws_values_global, - int* const __restrict__ ws_indices_global) - { - (void)ws_indices_global; - - (void)alpha; // unused - (void)beta; // unused - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - // LDS - __shared__ compType p_in_block_buffer[BlockBufferSize]; - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); - auto workspace_global_buf = make_dynamic_buffer( - ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize); - - auto in_block_buf = - make_dynamic_buffer(p_in_block_buffer, BlockBufferSize); - StaticBuffer accuValue_buf; - - accuValue_buf(I0) = zeroVal; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t blkgroup_id = block_global_id / BlkGroupSize; - const index_t block_local_id = block_global_id % BlkGroupSize; - - const index_t reduceSizePerBlock = - (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) / - BlockBufferSize) * - BlockBufferSize; - - constexpr auto in_block_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; - using ThreadClusterLengths = Sequence<1, BlockSize>; - - auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - srcDataType, - compType, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>( - src2dDesc, - make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock), - in_block_desc, - make_multi_index(0, 0)); - - constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); - - const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize; - - for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; - reducedBlocks += GredAccessesPerThreadInBlock) - { - blockwise_src_load.RunRead(src2dDesc, src_global_buf); - blockwise_src_load.RunWrite(in_block_desc, in_block_buf); - __syncthreads(); - - // do element-wise pre-reduction operation - blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf); - - index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) - ? GredAccessesPerThreadInBlock - : toReduceBlocks - reducedBlocks; - blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0)); - - blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - const auto workspace_desc = - make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize)); - - // The first thread in the block stores the reduced result to the global location - // representing the block - if(thread_local_id == 0) - { - auto threadwise_workspace_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(workspace_desc, - make_multi_index(block_global_id)); - - threadwise_workspace_store.Run(ReducedDataDesc, - make_tuple(I0), - accuValue_buf, - workspace_desc, - workspace_global_buf); - } - }; - - template <> - __device__ static void Run<2>(const src2dDescType& src2dDesc, - const dst1dDescType& dst1dDesc, - int origReduceLen, - int BlkGroupSize, - srcDataType alpha, - const srcDataType* const __restrict__ p_src_global, - dstDataType beta, - srcDataType* const __restrict__ ws_values_global, - int* const __restrict__ ws_indices_global) - { - (void)alpha; // unused - (void)beta; // unused - - const auto zeroVal = opReduce::GetReductionZeroVal(); - - // LDS - __shared__ compType p_in_block_values_buffer[BlockBufferSize]; - __shared__ int p_in_block_indices_buffer[BlockBufferSize]; - - const auto src_global_buf = make_dynamic_buffer( - p_src_global, src2dDesc.GetElementSpaceSize(), type_convert{}(zeroVal)); - auto workspace_global_val_buf = make_dynamic_buffer( - ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize); - auto workspace_global_idx_buf = make_dynamic_buffer( - ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize); - - auto in_block_val_buf = - make_dynamic_buffer(p_in_block_values_buffer, BlockBufferSize); - auto in_block_idx_buf = make_dynamic_buffer( - p_in_block_indices_buffer, BlockBufferSize); - StaticBuffer accuValue_buf; - StaticBuffer accuIndex_buf; - - accuValue_buf(I0) = zeroVal; - accuIndex_buf(I0) = 0; - - const auto toReduceLength = src2dDesc.GetLength(Number<1>{}); - const int divider = origReduceLen; - - const preUnaryOpType preUnaryOp(divider); - - const index_t thread_local_id = get_thread_local_1d_id(); - const index_t block_global_id = get_block_1d_id(); - const index_t blkgroup_id = block_global_id / BlkGroupSize; - const index_t block_local_id = block_global_id % BlkGroupSize; - - const index_t reduceSizePerBlock = - (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) / - BlockBufferSize) * - BlockBufferSize; - - constexpr auto in_block_desc = make_naive_tensor_descriptor_packed( - make_tuple(Number<1>{}, Number{})); - - using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>; - using ThreadClusterLengths = Sequence<1, BlockSize>; - - auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4, - ThreadSliceLengths, - ThreadClusterLengths, - Sequence<0, 1>, - srcDataType, - compType, - src2dDescType, - decltype(in_block_desc), - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - 1, - 1, - 1, - 1, - false, - true>( - src2dDesc, - make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock), - in_block_desc, - make_multi_index(0, 0)); - - constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize); - - const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize; - - int indexOffset = block_local_id * reduceSizePerBlock; - - for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks; - reducedBlocks += GredAccessesPerThreadInBlock) - { - blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset); - - blockwise_src_load.RunRead(src2dDesc, src_global_buf); - blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf); - - __syncthreads(); - - // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually - // done here - blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf); - - index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock) - ? GredAccessesPerThreadInBlock - : toReduceBlocks - reducedBlocks; - - blockwise_reduce::Reduce2(in_block_val_buf, - in_block_idx_buf, - BlocksInOneOp, - accuValue_buf(I0), - accuIndex_buf(I0)); - - indexOffset += BlockBufferSize; - - blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step); - } - - constexpr auto ReducedDataDesc = - make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); - - const auto workspace_desc = - make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize)); - - // The first thread in the block stores the reduced result to the global location - // representing the block - if(thread_local_id == 0) - { - auto threadwise_workspace_val_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(workspace_desc, - make_multi_index(block_global_id)); - - auto threadwise_workspace_idx_store = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0>, - 0, - 1, - InMemoryDataOperationEnum_t::Set, - 1, - true>(workspace_desc, - make_multi_index(block_global_id)); - - threadwise_workspace_val_store.Run(ReducedDataDesc, - make_tuple(I0), - accuValue_buf, - workspace_desc, - workspace_global_val_buf); - threadwise_workspace_idx_store.Run(ReducedDataDesc, - make_tuple(I0), - accuIndex_buf, - workspace_desc, - workspace_global_idx_buf); - } - }; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp deleted file mode 100644 index 046d3311aa767e9e53653ce04fa6a436d9ef6525..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp +++ /dev/null @@ -1,271 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP -#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP - -#include "data_type.hpp" - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_binop.hpp" - -namespace ck { - -template -struct BlockwiseReduction_2d_block_buffer -{ - using compType = typename opReduce::dataType; - - static constexpr auto buffer2dDesc = buffer2dDescType{}; - - static constexpr index_t BlockSize = - blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{}); - static constexpr index_t NumBlocks = - blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{}); - using binop = detail::binop_with_nan_check; - - // This interface does not accumulate on indices - template - __device__ static void - Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData) - { - const index_t thread_local_id = get_thread_local_1d_id(); - compType lAccuData = opReduce::GetReductionZeroVal(); - - index_t offset; - for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++) - { - offset = blockIsOneRow - ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd)); - compType opData = type_convert{}(block_buffer[offset]); - - binop::calculate(lAccuData, opData); - } - - offset = blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0)); - - block_buffer(offset) = lAccuData; - - __syncthreads(); - - for(index_t indOffset = BlockSize / 2; indOffset > 0; indOffset /= 2) - { - if(thread_local_id < indOffset) - { - index_t offset1 = - blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0)); - - index_t offset2 = - blockIsOneRow - ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id + indOffset)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0)); - - compType opData1 = type_convert{}(block_buffer[offset1]); - compType opData2 = type_convert{}(block_buffer[offset2]); - binop::calculate(opData1, opData2); - block_buffer(offset1) = type_convert{}(opData1); - } - - __syncthreads(); - } - - if(thread_local_id == 0) - { - compType tmpVal = type_convert{}(block_buffer[0]); - - binop::calculate(accuData, tmpVal); - } - }; - - // This interface accumulates on both data values and indices - template - __device__ static void Reduce2(BufferType& block_buffer, - IdxBufferType& block_indices_buffer, - index_t toReduceBlocks, - compType& accuData, - int& accuIndex) - { - const index_t thread_local_id = get_thread_local_1d_id(); - compType lAccuData = opReduce::GetReductionZeroVal(); - int lAccuIndex = 0; - - if constexpr(blockIsOneRow) - { - for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++) - { - for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2) - { - if(thread_local_id % (indOffset * 2) == 0) - { - index_t offset1 = - buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id)); - index_t offset2 = buffer2dDesc.CalculateOffset( - make_tuple(otherDimInd, thread_local_id + indOffset)); - - compType currVal1 = type_convert{}(block_buffer[offset1]); - compType currVal2 = type_convert{}(block_buffer[offset2]); - int currIndex1 = block_indices_buffer[offset1]; - int currIndex2 = block_indices_buffer[offset2]; - - binop::calculate(currVal1, currVal2, currIndex1, currIndex2); - block_buffer(offset1) = type_convert{}(currVal1); - block_indices_buffer(offset1) = currIndex1; - } - __syncthreads(); - } - } - - if(thread_local_id == 0) - { - for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++) - { - index_t offset = buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, 0)); - - compType tmpVal = type_convert{}(block_buffer[offset]); - int tmpIndex = block_indices_buffer[offset]; - - binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex); - } - - binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex); - } - } - else - { - index_t offset; - - for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++) - { - offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd)); - compType currVal = type_convert{}(block_buffer[offset]); - int currIndex = block_indices_buffer[offset]; - - binop::calculate(lAccuData, currVal, lAccuIndex, currIndex); - } - - offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0)); - - block_buffer(offset) = lAccuData; - block_indices_buffer(offset) = lAccuIndex; - - __syncthreads(); - - for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2) - { - if(thread_local_id % (indOffset * 2) == 0) - { - index_t offset1 = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0)); - index_t offset2 = - buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0)); - - compType currVal1 = type_convert{}(block_buffer[offset1]); - compType currVal2 = type_convert{}(block_buffer[offset2]); - int currIndex1 = block_indices_buffer[offset1]; - int currIndex2 = block_indices_buffer[offset2]; - - binop::calculate(currVal1, currVal2, currIndex1, currIndex2); - block_buffer(offset1) = type_convert{}(currVal1); - block_indices_buffer(offset1) = currIndex1; - } - - __syncthreads(); - } - - if(thread_local_id == 0) - { - compType tmpVal = type_convert{}(block_buffer[0]); - int tmpIndex = block_indices_buffer[0]; - - binop::calculate(accuData, tmpVal, accuIndex, tmpIndex); - } - } - }; - - template - __device__ static void set_buffer_value(BufferType& block_buffer, compType value) - { - index_t thread_id = get_thread_local_1d_id(); - - for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++) - { - index_t offset = blockIsOneRow - ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd)); - - block_buffer(offset) = value; - - __syncthreads(); - } - }; - - // Initialize the block-wise indices buffer, the index for each element in the block-wise data - // buffer - // is calculated according to its position in the buffer and the global starting index - template - __device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart) - { - index_t thread_id = get_thread_local_1d_id(); - - for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++) - { - index_t offset = blockIsOneRow - ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd)); - - block_indices_buffer(offset) = offset + indexStart; - - __syncthreads(); - } - }; - - // Execute unary operation on the block buffer elements - template - __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer) - { - index_t thread_id = get_thread_local_1d_id(); - - for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++) - { - index_t offset = blockIsOneRow - ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id)) - : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd)); - - block_buffer(offset) = unary_op(block_buffer[offset]); - - __syncthreads(); - } - }; -}; - -}; // end of namespace ck - -#endif diff --git a/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp deleted file mode 100644 index 2956606a6bab676658764c3427cd8c16154ebf39..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp +++ /dev/null @@ -1,141 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP -#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP - -#include "data_type.hpp" - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_binop.hpp" - -namespace ck { - -template -struct ThreadReduce -{ - using compType = typename opReduce::dataType; - - static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!"); - - static_assert( - std::is_same::value, - "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!"); - - static constexpr index_t ThreadBufferLen = BufferType::Size(); - - using binop = detail::binop_with_nan_check; - - // This interface does not accumulate on indices - __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData) - { - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { binop::calculate(accuData, thread_buffer[I]); }); - }; - - // This interface accumulates on both data values and indices and - // is called by Direct_ThreadWise reduction method at first-time reduction - __device__ static void - Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - int currIndex = I + indexStart; - binop::calculate(accuData, thread_buffer[I], accuIndex, currIndex); - }); - }; - - // Set the elements in the per-thread buffer to a specific value - // cppcheck-suppress constParameter - __device__ static void set_buffer_value(BufferType& thread_buffer, compType value) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; }); - }; - - // Execute unary operation on the per-thread buffer elements - template - __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) - { - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); - }; -}; - -template -struct ThreadReduceWithIndicesInput -{ - using compType = typename opReduce::dataType; - - static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!"); - static_assert(IdxBufferType::IsStaticBuffer(), - "Thread-wise reduction needs use StaticBuffer for indices!"); - - static_assert( - std::is_same::value, - "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!"); - static_assert(std::is_same::value, - "Indices type of StaticBuffer for Thread-wise reduction should be index_t!"); - - static_assert(BufferType::Size() == IdxBufferType::Size(), - "StaticBuffers for data and indices should have the same sizes!"); - - static constexpr index_t ThreadBufferLen = BufferType::Size(); - - using binop = detail::binop_with_nan_check; - - // This interface accumulates on both data values and indices and - // is called by Direct_ThreadWise reduction method at second-time reduction - __device__ static void Reduce(const BufferType& thread_buffer, - const IdxBufferType& thread_indices_buffer, - compType& accuData, - int& accuIndex) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - binop::calculate(accuData, thread_buffer[I], accuIndex, thread_indices_buffer[I]); - }); - }; - - // Set the elements in the per-thread buffer to a specific value - // cppcheck-suppress constParameter - __device__ static void set_buffer_value(BufferType& thread_buffer, compType value) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; }); - }; - - // Execute unary operation on the per-thread buffer elements - template - __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) - { - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); - }; -}; - -}; // end of namespace ck - -#endif diff --git a/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp b/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp deleted file mode 100644 index 9687d2d8c8650f1e4fe57653ab809641665fb103..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp +++ /dev/null @@ -1,371 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP -#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP - -#include "data_type.hpp" - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" -#include "reduction_functions_binop.hpp" - -namespace ck { - -template -struct WarpReduce -{ - using compType = typename opReduce::dataType; - using binop = detail::binop_with_nan_check; - - static_assert(BufferType::IsStaticBuffer(), - "Per-thread buffer for WarpWise reduction should be StaticBuffer!"); - static_assert(std::is_same::value, - "Data type of per-thread StaticBuffer for WarpWise reduction should be same as " - "the compType!"); - - static constexpr index_t ThreadBufferLen = BufferType::Size(); - static constexpr bool have_builtin_shuffle = - std::is_same::value || std::is_same::value; - - // This interface does not accumulate on indices - __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData) - { - if constexpr(have_builtin_shuffle) - ReduceImpl1(thread_buffer, accuData); - else - ReduceImpl2(thread_buffer, accuData); - }; - - // This interface implementation uses HIP built-in device shuffling functions - __device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); }); - - // synchronize among all threads in this warp - __all(1); - - for(index_t stride = warpSize / 2; stride > 0; stride /= 2) - { - compType tmpVal = __shfl_down(lAccuData, stride, warpSize); - binop::calculate(lAccuData, tmpVal); - __all(1); - } - - binop::calculate(accuData, lAccuData); - }; - - // This interface implementation does not use HIP built-in device shuffling functions - // since for fp16, built-in shuffling functions is not provided by HIP - __device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); }); - - __syncthreads(); - - index_t thread_id = get_thread_local_1d_id(); - index_t warpId = thread_id / warpSize; - index_t thread_inwarp_id = thread_id % warpSize; - - __shared__ compType shuffle_buffer[BlockSize]; - - compType* myBuffer = &shuffle_buffer[warpId * warpSize]; - - myBuffer[thread_inwarp_id] = lAccuData; - - __syncthreads(); - - for(index_t stride = warpSize / 2; stride > 0; stride /= 2) - { - if(thread_inwarp_id < stride) - { - compType currVal1 = myBuffer[thread_inwarp_id]; - compType currVal2 = myBuffer[thread_inwarp_id + stride]; - - binop::calculate(currVal1, currVal2); - - myBuffer[thread_inwarp_id] = currVal1; - } - - __syncthreads(); - } - if(thread_inwarp_id == 0) - binop::calculate(accuData, myBuffer[0]); - }; - - // This interface accumulates on both data values and indices and is called by Direct_WarpWise - // reduction method at first-time reduction - __device__ static void - Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart) - { - if constexpr(have_builtin_shuffle) - Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart); - else - Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart); - }; - - // This interface implementation uses HIP built-in device shuffling functions - __device__ static void Reduce2Impl1(const BufferType& thread_buffer, - compType& accuData, - int& accuIndex, - int indexStart) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - int lAccuIndex = 0; - index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize; - - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart; - binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex); - }); - - // synchronize among all threads in this warp - __all(1); - - for(index_t stride = 1; stride < warpSize; stride *= 2) - { - compType tmpVal = __shfl_down(lAccuData, stride, warpSize); - int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize); - - binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex); - __all(1); - } - - if(thread_inwarp_id == 0) - binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex); - }; - - // This interface implementation does not use HIP built-in device shuffling functions since for - // fp16, built-in shuffling functions is not provided by HIP - __device__ static void Reduce2Impl2(const BufferType& thread_buffer, - compType& accuData, - int& accuIndex, - int indexStart) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - int lAccuIndex = 0; - index_t thread_id = get_thread_local_1d_id(); - index_t warpId = thread_id / warpSize; - index_t thread_inwarp_id = thread_id % warpSize; - - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart; - binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex); - }); - - __shared__ compType shuffle_data_buffer[BlockSize]; - __shared__ int shuffle_indices_buffer[BlockSize]; - - compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize]; - int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize]; - - myDataBuffer[thread_inwarp_id] = lAccuData; - myIndicesBuffer[thread_inwarp_id] = lAccuIndex; - - __syncthreads(); - - for(index_t stride = 1; stride < warpSize; stride *= 2) - { - compType currVal1 = myDataBuffer[thread_inwarp_id]; - compType currVal2 = myDataBuffer[thread_inwarp_id + stride]; - int currIndex1 = myIndicesBuffer[thread_inwarp_id]; - int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride]; - - binop::calculate(currVal1, currVal2, currIndex1, currIndex2); - - myDataBuffer[thread_inwarp_id] = currVal1; - myIndicesBuffer[thread_inwarp_id] = currIndex1; - - __syncthreads(); - } - - if(thread_inwarp_id == 0) - binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]); - }; - - // cppcheck-suppress constParameter - __device__ static void set_buffer_value(BufferType& thread_buffer, compType value) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; }); - - __all(1); - }; - - // Execute unary operation on the per-thread buffer elements - template - __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) - { - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); - - __all(1); - }; -}; - -template -struct WarpReduceWithIndicesInput -{ - using compType = typename opReduce::dataType; - using binop = detail::binop_with_nan_check; - - static_assert(BufferType::IsStaticBuffer(), - "Per-thread buffer for WarpWise reduction should be StaticBuffer!"); - static_assert(IdxBufferType::IsStaticBuffer(), - "Per-thread buffer for WarpWise reduction should be StaticBuffer for indices!"); - - static_assert(std::is_same::value, - "Data type of per-thread StaticBuffer for WarpWise reduction should be same as " - "the compType!"); - static_assert( - std::is_same::value, - "Indices type per-thread of StaticBuffer for WarpWise reduction should be index_t!"); - - static_assert(BufferType::Size() == IdxBufferType::Size(), - "StaticBuffers for data and indices should have the same sizes!"); - - static constexpr index_t ThreadBufferLen = BufferType::Size(); - static constexpr bool have_builtin_shuffle = - std::is_same::value || std::is_same::value; - - // This interface accumulates on both data values and indices and is called by Direct_WarpWise - // reduction method at second-time reduction - __device__ static void Reduce(const BufferType& thread_buffer, - const IdxBufferType& thread_indices_buffer, - compType& accuData, - int& accuIndex) - { - if constexpr(have_builtin_shuffle) - ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex); - else - ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex); - }; - - // This interface implementation uses HIP built-in device shuffling functions - __device__ static void ReduceImpl1(const BufferType& thread_buffer, - const IdxBufferType& thread_indices_buffer, - compType& accuData, - int& accuIndex) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - int lAccuIndex = 0; - - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]); - }); - - // synchronize among all threads in this warp - __all(1); - - for(index_t stride = 1; stride < warpSize; stride *= 2) - { - compType tmpVal = __shfl_down(lAccuData, stride, warpSize); - int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize); - - binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex); - __all(1); - } - - binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex); - }; - - // This interface implementation does not use HIP built-in device shuffling functions - // since for fp16, built-in shuffling functions is not provided by HIP - __device__ static void ReduceImpl2(const BufferType& thread_buffer, - const IdxBufferType& thread_indices_buffer, - compType& accuData, - int& accuIndex) - { - compType lAccuData = opReduce::GetReductionZeroVal(); - int lAccuIndex = 0; - index_t thread_id = get_thread_local_1d_id(); - index_t warpId = thread_id / warpSize; - index_t thread_inwarp_id = thread_id % warpSize; - - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { - binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]); - }); - - __shared__ compType shuffle_data_buffer[BlockSize]; - __shared__ int shuffle_indices_buffer[BlockSize]; - - compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize]; - int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize]; - - myDataBuffer[thread_inwarp_id] = lAccuData; - myIndicesBuffer[thread_inwarp_id] = lAccuIndex; - - __syncthreads(); - - for(index_t stride = 1; stride < warpSize; stride *= 2) - { - compType currVal1 = myDataBuffer[thread_inwarp_id]; - compType currVal2 = myDataBuffer[thread_inwarp_id + stride]; - int currIndex1 = myIndicesBuffer[thread_inwarp_id]; - int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride]; - - binop::calculate(currVal1, currVal2, currIndex1, currIndex2); - - myDataBuffer[thread_inwarp_id] = currVal1; - myIndicesBuffer[thread_inwarp_id] = currIndex1; - - __syncthreads(); - } - - if(thread_inwarp_id == 0) - binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]); - }; - - // cppcheck-suppress constParameter - __device__ static void set_buffer_value(BufferType& thread_buffer, compType value) - { - static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; }); - - __all(1); - }; - - // Execute unary operation on the per-thread buffer elements - template - __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer) - { - static_for<0, ThreadBufferLen, 1>{}( - [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); }); - - __all(1); - }; -}; - -}; // end of namespace ck - -#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp b/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp deleted file mode 100644 index 8b75381026820317bb856ed4cb6807ff334f4d77..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp +++ /dev/null @@ -1,223 +0,0 @@ -#ifndef CK_THREADWISE_CONTRACTION_DLOPS_HPP -#define CK_THREADWISE_CONTRACTION_DLOPS_HPP - -#include "common_header.hpp" -#include "math.hpp" - -namespace ck { - -// C[TM0, TM1, TN0, TN1] += A[TK, TM0, TM1] * B[TK, TN0, TN1] -// Tensor element can be vectorized data -// Assume: -// 1. AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, CThreadDesc_TM0_TM1_TN0_TN1 are -// known at compile-time -// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time -template ::type = false> -struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1 -{ - __device__ constexpr ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1() - { - static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() && - BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() && - CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(), - "wrong! Desc should be known at compile-time"); - - // TODO: sanity-check: compare AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, - // CThreadDesc_TM0_TM1_TN0_TN1 Size with KLenghts, TMLengths and TNLengths - - // TODO remove this restriction - static_assert(TKLengths::Size() == 1 && TMLengths::Size() == 2 && TNLengths::Size() == 2, - "wrong!"); - } - - template - __device__ static void Run(const ABuffer& a_buf, - AOriginIdx, - const BBuffer& b_buf, - BOriginIdx, - CBuffer& c_buf, - COriginIdx) - { - static_assert(is_known_at_compile_time>::value && - is_known_at_compile_time>::value && - is_known_at_compile_time>::value, - "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); - - static_assert( - is_same, remove_cvref_t>::value && - is_same, remove_cvref_t>::value && - is_same, remove_cvref_t>::value && - "wrong! inconsistent type"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - constexpr auto TK = TKLengths{}[I0]; - constexpr auto TM0 = TMLengths{}[I0]; - constexpr auto TM1 = TMLengths{}[I1]; - constexpr auto TN0 = TNLengths{}[I0]; - constexpr auto TN1 = TNLengths{}[I1]; - - constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); - constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); - constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); - - static_for<0, TK, 1>{}([&](auto tk) { - static_for<0, TM0, 1>{}([&](auto tm0) { - static_for<0, TM1, 1>{}([&](auto tm1) { - static_for<0, TN0, 1>{}([&](auto tn0) { - static_for<0, TN1, 1>{}([&](auto tn1) { - constexpr index_t a_offset = - AThreadDesc_TK0_TM0_TM1_TK1{}.CalculateOffset( - a_origin_idx + make_multi_index(tk, tm0, tm1)); - constexpr index_t b_offset = - BThreadDesc_TK0_TN0_TN1_TK1{}.CalculateOffset( - b_origin_idx + make_multi_index(tk, tn0, tn1)); - constexpr index_t c_offset = - CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset( - c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1)); - - inner_product(a_buf[Number{}], - b_buf[Number{}], - c_buf(Number{})); - }); - }); - }); - }); - }); - } -}; - -// C[TM0, TM1, TN0, TN1] += A[TK0, TM0, TM1, TK1] * B[TK0, TN0, TN1, TK1] -// Tensor element can be vectorized data -// Assume: -// 1. AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, CThreadDesc_TM0_TM1_TN0_TN1 are -// known at compile-time -// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time -template ::type = false> -struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1 -{ - __device__ constexpr ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1() - { - static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() && - BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() && - CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(), - "wrong! Desc should be known at compile-time"); - - // TODO: sanity-check: compare AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, - // CThreadDesc_TM0_TM1_TN0_TN1 Size with KLenghts, TMLengths and TNLengths - - // TODO remove this restriction - static_assert(TKLengths::Size() == 2 && TMLengths::Size() == 2 && TNLengths::Size() == 2, - "wrong!"); - } - - template - __device__ static void Run(const ABuffer& a_buf, - AOriginIdx, - const BBuffer& b_buf, - BOriginIdx, - CBuffer& c_buf, - COriginIdx) - { - static_assert(is_known_at_compile_time>::value && - is_known_at_compile_time>::value && - is_known_at_compile_time>::value, - "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); - - static_assert( - is_same, remove_cvref_t>::value && - is_same, remove_cvref_t>::value && - is_same, remove_cvref_t>::value && - "wrong! inconsistent type"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - constexpr index_t TK0 = TKLengths{}[I0]; - constexpr index_t TK1 = TKLengths{}[I1]; - constexpr index_t TM0 = TMLengths{}[I0]; - constexpr index_t TM1 = TMLengths{}[I1]; - constexpr index_t TN0 = TNLengths{}[I0]; - constexpr index_t TN1 = TNLengths{}[I1]; - - constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); - constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); - constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); - - static_for<0, TK0, 1>{}([&](auto tk0) { - static_for<0, TM0, 1>{}([&](auto tm0) { - static_for<0, TM1, 1>{}([&](auto tm1) { - static_for<0, TN0, 1>{}([&](auto tn0) { - static_for<0, TN1, 1>{}([&](auto tn1) { - vector_type a_vec; - vector_type b_vec; - - static_for<0, TK1, 1>{}([&](auto tk1) { - constexpr index_t a_offset = - AThreadDesc_TK0_TM0_TM1_TK1{}.CalculateOffset( - a_origin_idx + make_multi_index(tk0, tm0, tm1, tk1)); - - constexpr index_t b_offset = - BThreadDesc_TK0_TN0_TN1_TK1{}.CalculateOffset( - b_origin_idx + make_multi_index(tk0, tn0, tn1, tk1)); - - a_vec.template AsType()(tk1) = a_buf[Number{}]; - b_vec.template AsType()(tk1) = b_buf[Number{}]; - }); - - using a_vector_t = typename vector_type::type; - using b_vector_t = typename vector_type::type; - - constexpr index_t c_offset = - CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset( - c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1)); - - inner_product( - a_vec.template AsType()[I0], - b_vec.template AsType()[I0], - c_buf(Number{})); - }); - }); - }); - }); - }); - } -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp deleted file mode 100644 index f6c15fd85ac1492b597d94ae19bb5d08b78b177e..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp +++ /dev/null @@ -1,157 +0,0 @@ -#ifndef CK_THREADWISE_GEMM_DLOPS_V3_HPP -#define CK_THREADWISE_GEMM_DLOPS_V3_HPP - -#include "common_header.hpp" -#include "math.hpp" - -namespace ck { - -// C[M, N] += transpose(A[K, M]) * B[K, N] -// Element of matrix can be vectorized data -// Assume: -// 1. ADesc, BDesc, CDesc are known at compile-time -// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time -template ::type = false> -struct ThreadwiseGemmDlops_km_kn_mn_v3 -{ - template - __device__ static void Run(const ABuffer& a_buf, - AOriginIdx, - const BBuffer& b_buf, - BOriginIdx, - CBuffer& c_buf, - COriginIdx) - { - static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() && - CDesc::IsKnownAtCompileTime(), - "wrong! Desc should be known at compile-time"); - - static_assert(is_known_at_compile_time>::value && - is_known_at_compile_time>::value && - is_known_at_compile_time>::value, - "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); - - static_assert( - is_same, remove_cvref_t>::value && - is_same, remove_cvref_t>::value && - is_same, remove_cvref_t>::value && - "wrong! inconsistent type"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - constexpr auto E = ADesc{}.GetLength(I0); - constexpr auto K = ADesc{}.GetLength(I1); - - constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); - constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); - constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); - - static_for<0, E, 1>{}([&](auto e) { - static_for<0, K, 1>{}([&](auto k) { - constexpr index_t a_offset = - ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k)); - - if constexpr(H == 2 && W == 2) - { - constexpr index_t b_offset_0 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0)); - constexpr index_t b_offset_1 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1)); - constexpr index_t b_offset_2 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0)); - constexpr index_t b_offset_3 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1)); - - constexpr index_t c_offset_0 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0)); - constexpr index_t c_offset_1 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1)); - constexpr index_t c_offset_2 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0)); - constexpr index_t c_offset_3 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1)); - - amd_assembly_outer_product_1x4(a_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - c_buf(Number{}), - c_buf(Number{}), - c_buf(Number{}), - c_buf(Number{})); - } - else if constexpr(H == 4 && W == 1) - { - constexpr index_t b_offset_0 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0)); - constexpr index_t b_offset_1 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0)); - constexpr index_t b_offset_2 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0)); - constexpr index_t b_offset_3 = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0)); - - constexpr index_t c_offset_0 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0)); - constexpr index_t c_offset_1 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0)); - constexpr index_t c_offset_2 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0)); - constexpr index_t c_offset_3 = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0)); - - amd_assembly_outer_product_1x4(a_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - b_buf[Number{}], - c_buf(Number{}), - c_buf(Number{}), - c_buf(Number{}), - c_buf(Number{})); - } - else - { - static_for<0, H, 1>{}([&](auto h) { - static_for<0, W, 1>{}([&](auto w) { - constexpr index_t b_offset = - BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w)); - - constexpr index_t c_offset = - CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w)); - -#if 0 - c_buf(Number{}) += inner_product_with_conversion{}( - a_buf[Number{}], b_buf[Number{}]); -#else - amd_assembly_inner_product(a_buf[Number{}], - b_buf[Number{}], - c_buf(Number{})); -#endif - }); - }); - } - }); - }); - } -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp deleted file mode 100644 index 7e3f6b3489a89732e16f13fb41a2a2a3eef128ab..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ /dev/null @@ -1,1436 +0,0 @@ -#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP -#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_HPP - -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" - -namespace ck { - -// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory -// and sometimes useless instructions: -// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument -// instead -// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same -// tensor coordinate instead -// 3. Don't use a pointer to VGPR buffer, use vector instead - -namespace detail { -// TODO: How to fix this? It uses an struct instead of lambda because lambda -// doesn't have constructor -template -struct lambda_scalar_per_access -{ - __host__ __device__ constexpr auto operator()(index_t i) const - { - return (i == VectorDim) ? ScalarPerVector : 1; - } -}; - -template -struct lambda_scalar_step_in_vector -{ - __host__ __device__ constexpr auto operator()(index_t i) const - { - return (i == VectorDim) ? 1 : 0; - } -}; -} // namespace detail - -// Assume: -// 1. src: -// 1. SrcDesc is known at compile-time -// 2. SrcBuffer is StaticBuffer -// 3. SrcSliceOrginIdx is known at compile-time -// 2. dst: -// 1. DstDesc is not known at compile-time -// 2. DstBuffer is DynamicBuffer -// 3. DstSliceOrginIdx is not known at compile time -template ::type = false> -struct ThreadwiseTensorSliceTransfer_v1r3 -{ - static constexpr index_t nDim = SliceLengths::Size(); - - using Index = MultiIndex; - - using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - - using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); - - __device__ constexpr ThreadwiseTensorSliceTransfer_v1r3(const DstDesc& dst_desc, - const Index& dst_slice_origin_idx) - : dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)) - { - static_assert(SrcDesc::IsKnownAtCompileTime(), - "wrong! SrcDesc need to known at compile-time"); - } - - __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) - { - dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); - } - - template - __device__ void Run(const SrcDesc&, - const SrcSliceOriginIdx&, - const SrcBuffer& src_buf, - const DstDesc& dst_desc, - DstBuffer& dst_buf, - const DstStepHacks& dst_step_hacks) - { - static_assert(SrcDesc::IsKnownAtCompileTime(), - "wrong! SrcDesc need to known at compile-time"); - - static_assert(is_known_at_compile_time>::value, - "wrong! SrcSliceOrigin need to known at compile-time"); - - static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer"); - - // SrcDesc and src_slice_origin_idx are known at compile-time - constexpr auto src_desc = remove_cvref_t{}; - constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{}); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto dst_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - - constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // make forward steps - const auto dst_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, forward_step_idx, dst_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto dst_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, backward_step_idx, dst_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_idx[I0]; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] - ? ordered_access_idx[i] - : ordered_access_lengths[i] - 1 - ordered_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - dst_scalar_per_access; - }(); - - typename vector_type_maker::type dst_vector; - - using dst_vector_t = - typename vector_type_maker::type::type; - - // copy data from src_buf into dst_vector - static_for<0, DstScalarPerVector, 1>{}([&](auto i) { - constexpr index_t src_offset = src_desc.CalculateOffset( - src_slice_origin_idx + dst_data_idx + i * dst_scalar_step_in_vector); - - dst_vector.template AsType()(i) = - type_convert{}(src_buf[Number{}]); - }); - - const bool is_dst_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); - - // copy data from dst_vector into dst_buf - if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) - { - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - } - else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) - { - dst_buf.template AtomicAdd( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - } - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]); - } - } - }); - }); - - // move dst coordinate back to slice origin (or not) - if constexpr(DstResetCoordinateAfterRun) - { - const auto dst_reset_step = - make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); - - move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); - } - } - - template - __device__ void Run(const SrcDesc&, - const SrcSliceOriginIdx&, - const SrcBuffer& src_buf, - const DstDesc& dst_desc, - DstBuffer& dst_buf) - { - constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto dst_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - Run(SrcDesc{}, SrcSliceOriginIdx{}, src_buf, dst_desc, dst_buf, dst_step_hacks); - } - - __device__ static constexpr auto GetDstCoordinateResetStep() - { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_lengths[I0] - 1; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index after last iteration in Run(), if it has not being reset by - // RunWrite() - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - dst_scalar_per_access; - }(); - - // - constexpr auto reset_dst_data_step = [&]() { - Index reset_dst_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); - - return reset_dst_data_step_; - }(); - - return reset_dst_data_step; - } - - // dst_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, - const Index& dst_slice_origin_step_idx) - { - // if dst coord was not reset by Run(), then need to adjust the step here - const auto adjusted_step_idx = - DstResetCoordinateAfterRun ? dst_slice_origin_step_idx - : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); - - move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); - } - - private: - DstCoord dst_coord_; -}; // namespace ck - -// Assume: -// 1. src: -// 1. SrcDesc is not known at compile-time -// 2. SrcBuffer is DynamicBuffer -// 3. src_slice_origin_idx is not known at compile-time -// 2. dst: -// 1. DstDesc is known at compile-time -// 2. DstBuffer is StaticBuffer -// 3. dst_slice_origin_idx is known at compile-time -template ::type = false> -struct ThreadwiseTensorSliceTransfer_v2 -{ - static constexpr index_t nDim = SliceLengths::Size(); - - using Index = MultiIndex; - - using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - - using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); - - __device__ constexpr ThreadwiseTensorSliceTransfer_v2(const SrcDesc& src_desc, - const Index& src_slice_origin_idx) - : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin_idx)) - { - static_assert(DstDesc::IsKnownAtCompileTime(), - "wrong! SrcDesc need to known at compile-time"); - } - - __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) - { - src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); - } - - template - __device__ void Run(const SrcDesc& src_desc, - const SrcBuffer& src_buf, - const DstDesc&, - const DstSliceOriginIdx&, - DstBuffer& dst_buf, - const SrcStepHacks& src_step_hacks) - { - static_assert(DstDesc::IsKnownAtCompileTime(), - "wrong! DstDesc need to known at compile-time"); - - static_assert(is_known_at_compile_time>::value, - "wrong! DstSliceOrigin need to known at compile-time"); - - static_assert( - is_same, remove_cvref_t>::value && - "wrong! inconsistent type"); - - // DstDesc and dst_slice_origin_idx are known at compile-time - constexpr auto dst_desc = remove_cvref_t{}; - constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{}; - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto src_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - - constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // make forward steps - const auto src_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, forward_step_idx, src_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto src_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, backward_step_idx, src_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_idx[I0]; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] - ? ordered_access_idx[i] - : ordered_access_lengths[i] - 1 - ordered_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - src_scalar_per_access; - }(); - - typename vector_type_maker::type src_vector; - - using src_vector_t = - typename vector_type_maker::type::type; - - const bool is_src_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); - - // copy data from src_buf into src_vector - src_vector.template AsType()(Number<0>{}) = - src_buf.template Get(src_coord_.GetOffset(), is_src_valid); - - // copy data from src_vector into dst_buf - static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { - constexpr index_t dst_offset = - dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx + - i * src_scalar_step_in_vector); - - dst_buf(Number{}) = src_vector.template AsType()[i]; - }); - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - src_desc, src_coord_, src_forward_steps[dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - src_desc, src_coord_, src_backward_steps[dim_access_order[i]]); - } - } - }); - }); - - // move src coordinate back to slice origin (or not) - if constexpr(SrcResetCoordinateAfterRun) - { - const auto src_reset_step = - make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); - - move_tensor_coordinate(src_desc, src_coord_, src_reset_step); - } - } - - template - __device__ void Run(const SrcDesc& src_desc, - const SrcBuffer& src_buf, - const DstDesc&, - const DstSliceOriginIdx&, - DstBuffer& dst_buf) - { - constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto src_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - Run(src_desc, src_buf, DstDesc{}, DstSliceOriginIdx{}, dst_buf, src_step_hacks); - } - - __device__ static constexpr auto GetSrcCoordinateResetStep() - { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_lengths[I0] - 1; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index after last iteration in Run(), if it has not being reset by - // RunWrite() - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - src_scalar_per_access; - }(); - - // - constexpr auto reset_src_data_step = [&]() { - Index reset_src_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); - - return reset_src_data_step_; - }(); - - return reset_src_data_step; - } - - // dst_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& src_slice_origin_step_idx) - { - // if src coord was not reset by Run(), then need to adjust the step here - const auto adjusted_step_idx = - SrcResetCoordinateAfterRun ? src_slice_origin_step_idx - : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); - - move_tensor_coordinate(src_desc, src_coord_, adjusted_step); - } - - private: - SrcCoord src_coord_; -}; // namespace ck - -// Assume: -// 1. src_desc and dst_desc are not known at compile-time -// 2. SrcBuffer and DstBuffer are DynamicBuffer -// 3. src_slice_origin and dst_slice_origin are not known at compile-time, -// 4. Use thread buffer -template // control whether to move back dst coordinate after each - // RunWrite(), will be fused with MoveDstSliceWindow to - // save addr computation -struct ThreadwiseTensorSliceTransfer_v3 -{ - static constexpr index_t nDim = SliceLengths::Size(); - using Index = MultiIndex; - - using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - - using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); - using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); - - __device__ constexpr ThreadwiseTensorSliceTransfer_v3(const SrcDesc& src_desc, - const Index& src_slice_origin, - const DstDesc& dst_desc, - const Index& dst_slice_origin) - : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), - dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) - { - } - - __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) - { - src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); - } - - __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) - { - dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); - } - - template - __device__ void - RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) - { - static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, - "wrong!"); - - static_assert( - is_same, remove_cvref_t>::value, - "wrong! SrcBuffer and SrcData data type are inconsistent"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto src_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - - constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - - constexpr auto src_dim_access_order = SrcDimAccessOrder{}; - - constexpr auto ordered_src_access_lengths = - container_reorder_given_new2old(src_access_lengths, src_dim_access_order); - - // make forward steps - const auto src_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, forward_step_idx, src_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto src_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, backward_step_idx, src_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_src_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_src_access_idx[I0]; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i] - : ordered_src_access_lengths[i] - 1 - - ordered_src_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * - src_scalar_per_access; - }(); - - vector_type_maker_t src_tmp_vector; - - using src_vector_t = typename decltype(src_tmp_vector)::type; - - const bool is_src_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); - - // copy data from src_buf to src_tmp_vector - src_tmp_vector.template AsType()(Number<0>{}) = - src_buf.template Get(src_coord_.GetOffset(), is_src_valid); - - // copy data from src_tmp_vector to buffer_ - static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { - constexpr index_t buffer_offset = - buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector); - - buffer_(Number{}) = src_tmp_vector.template AsType()[i]; - }); - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= - ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); - } - } - }); - }); - - // move src coordinate back to slice origin (or not) - if constexpr(SrcResetCoordinateAfterRun) - { - const auto src_reset_step = - make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); - - move_tensor_coordinate(src_desc, src_coord_, src_reset_step); - } - } - - template - __device__ void - RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) - { - static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, - "wrong!"); - - static_assert( - is_same, remove_cvref_t>::value, - "wrong! SrcBuffer or DstBuffer data type is wrong"); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - - // src scalar per access on each dim - // TODO: don't use this - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto dst_scalar_step_in_vector = - generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - - constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dst_dim_access_order = DstDimAccessOrder{}; - - constexpr auto ordered_dst_access_lengths = - container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); - - // make forward steps - const auto dst_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, forward_step_idx, dst_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto dst_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, backward_step_idx, dst_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_dst_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_dst_access_idx[I0]; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i] - : ordered_dst_access_lengths[i] - 1 - - ordered_dst_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * - dst_scalar_per_access; - }(); - - vector_type_maker_t dst_tmp_vector; - - // copy data from buffer_ to dst_tmp_vector - static_for<0, DstScalarPerVector, 1>{}([&](auto i) { - constexpr index_t buffer_offset = - buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector); - - dst_tmp_vector.template AsType()(i) = - type_convert{}(buffer_[Number{}]); - }); - - using dst_vector_t = typename decltype(dst_tmp_vector)::type; - - // copy data from dst_tmp_vector to dst_buf - const bool is_dst_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); - - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_tmp_vector.template AsType()[Number<0>{}]); - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= - ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); - } - } - }); - }); - - // move dst coordinate back to slice origin (or not) - if constexpr(DstResetCoordinateAfterRun) - { - const auto dst_reset_step = - make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); - - move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); - } - } - - template - __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) - { - constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto src_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - RunRead(src_desc, src_buf, src_step_hacks); - } - - template - __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) - { - constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto dst_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - RunWrite(dst_desc, dst_buf, dst_step_hacks); - } - - __device__ static constexpr auto GetSrcCoordinateResetStep() - { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto src_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - - constexpr auto src_dim_access_order = SrcDimAccessOrder{}; - - constexpr auto ordered_src_access_lengths = - container_reorder_given_new2old(src_access_lengths, src_dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_src_access_lengths[I0] - 1; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index after last iteration in RunRead(), if it has not being reset by - // RunRead() - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * - src_scalar_per_access; - }(); - - // - constexpr auto reset_src_data_step = [&]() { - Index reset_src_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); - - return reset_src_data_step_; - }(); - - return reset_src_data_step; - } - - __device__ static constexpr auto GetDstCoordinateResetStep() - { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access - constexpr auto dst_scalar_per_access = generate_sequence( - detail::lambda_scalar_per_access{}, Number{}); - - constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; - - constexpr auto dst_dim_access_order = DstDimAccessOrder{}; - - constexpr auto ordered_dst_access_lengths = - container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_dst_access_lengths[I0] - 1; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index after last iteration in RunWrite(), if it has not being reset by - // RunWrite() - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * - dst_scalar_per_access; - }(); - - // - constexpr auto reset_dst_data_step = [&]() { - Index reset_dst_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); - - return reset_dst_data_step_; - }(); - - return reset_dst_data_step; - } - - // src_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& src_slice_origin_step_idx) - { - // if src coord was not reset by RunRead(), then need to adjust the step here - const auto adjusted_step_idx = - SrcResetCoordinateAfterRun ? src_slice_origin_step_idx - : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); - - move_tensor_coordinate(src_desc, src_coord_, adjusted_step); - } - - // src_slice_origin_step_idx need to be known at compile-time, for performance reason - template - __device__ void - MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& src_slice_origin_step_idx, - const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) - { - // if src coord was not reset by RunRead(), then need to adjust the step here - const auto adjusted_step_idx = - SrcResetCoordinateAfterRun ? src_slice_origin_step_idx - : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step( - src_desc, adjusted_step_idx, src_move_slice_window_step_hack); - - move_tensor_coordinate(src_desc, src_coord_, adjusted_step); - } - // dst_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, - const Index& dst_slice_origin_step_idx) - { - // if dst coord was not reset by RunWrite(), then need to adjust the step here - const auto adjusted_step_idx = - DstResetCoordinateAfterRun ? dst_slice_origin_step_idx - : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); - - move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); - } - - private: - static constexpr auto buffer_desc_ = - make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); - - static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); - - StaticBuffer buffer_; - - SrcCoord src_coord_; - DstCoord dst_coord_; -}; - -// Assume: -// 1. src: -// 1. SrcDesc is known at compile-time -// 2. SrcBuffer is DynamicBuffer -// 3. src_ref_idx is known at run-time -// 4. SrcRefToOriginDisplacement is known at compile-time -// 5. use #-step -// 2. dst: -// 1. DstDesc is known at compile-time -// 2. DstBuffer is StaticBuffer -// 3. DstOriginIdx is known at compile-time -// 4. use direct address calculation -// 3. vector access on src -template ::type = false> -struct ThreadwiseTensorSliceTransfer_v4 -{ - static constexpr index_t nDim = SliceLengths::Size(); - - using Index = MultiIndex; - - using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - - using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); - - __device__ constexpr ThreadwiseTensorSliceTransfer_v4(const Index& src_ref_idx) - : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx)) - { - static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), - "wrong! SrcDesc and DstDesc need to known at compile-time"); - - static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, "wrong!"); - } - - template - __device__ void Run(const SrcDesc&, - const SrcRefToOriginDisplacement&, - const SrcBuffer& src_buf, - const DstDesc&, - const DstOriginIdx&, - DstBuffer& dst_buf) const - { - static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), - "wrong! SrcDesc and DstDesc need to known at compile-time"); - - static_assert( - is_same, remove_cvref_t>::value && - is_same, remove_cvref_t>::value, - "wrong! SrcBuffer or DstBuffer data type is wrong"); - - static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); - - static_assert(is_known_at_compile_time>::value && - is_known_at_compile_time>::value, - "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " - "at compile-time"); - - // SrcDesc and DstDesc are known at compile-time - constexpr auto src_desc = remove_cvref_t{}; - constexpr auto dst_desc = remove_cvref_t{}; - - // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time - constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{}); - constexpr auto dst_origin_idx = to_multi_index(DstOriginIdx{}); - - // scalar per access of each dim - constexpr auto src_scalar_per_access = generate_sequence_v2( - [&](auto i) constexpr { - if constexpr(i == SrcVectorDim) - { - return Number{}; - } - else - { - return Number<1>{}; - } - }, - Number{}); - - // scalar step (if steping on SrcVectorDim) of each dim - constexpr auto src_scalar_step_in_vector = generate_sequence_v2( - [&](auto i) constexpr { - if constexpr(i == SrcVectorDim) - { - return Number<1>{}; - } - else - { - return Number<0>{}; - } - }, - Number{}); - - constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - static_ford{}([&](auto ordered_access_idx) { -#if 0 - // TODO: unable to compile - // position in slice window - constexpr auto data_to_origin_disp_idx = - container_reorder_given_old2new(ordered_access_idx, dim_access_order) * - src_scalar_per_access; -#else - // position in slice window - constexpr auto data_to_origin_disp_idx = - ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access; -#endif - // src coordinate - constexpr auto src_ref_to_data_disp_idx = - src_ref_to_origin_disp_idx + data_to_origin_disp_idx; - - constexpr auto src_ref_to_data_disp_coord_step = - make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx); - - auto src_data_coord = src_ref_coord_; - - move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step); - - vector_type_maker_t src_tmp_vector; - - using src_vector_t = typename decltype(src_tmp_vector)::type; - - const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid( - src_desc, src_data_coord); - - // copy data from src_buf into src_tmp_vector - src_tmp_vector.template AsType()(Number<0>{}) = - src_buf.template Get(src_data_coord.GetOffset(), is_src_valid); - - // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to - // DstData) - vector_type_maker_t dst_tmp_vector; - - // TODO: if SrcData and DstData are vetor type, then static_cast may not compile - static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { - dst_tmp_vector.template AsType()(i) = - type_convert{}(src_tmp_vector.template AsType()[i]); - }); - - // copy data from dst_tmp_vector into dst_buf - static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { - constexpr index_t dst_offset = dst_desc.CalculateOffset( - dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector); - - dst_buf(Number{}) = dst_tmp_vector.template AsType()[i]; - }); - }); - } - - template - __device__ void MoveSrcSliceWindow(const SrcDesc&, - const SrcSliceMoveStepIdx& src_slice_move_step_idx) - { - constexpr auto src_desc = SrcDesc{}; - - const auto src_slice_move_step_iter = - make_tensor_coordinate_step(src_desc, to_multi_index(src_slice_move_step_idx)); - - move_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter); - } - - private: - SrcCoord src_ref_coord_; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp deleted file mode 100644 index bbdaa5fa2bcd15f3951f57123d7cc205ad3e2d4b..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp +++ /dev/null @@ -1,776 +0,0 @@ -#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V2_HPP -#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V2_HPP - -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" - -namespace ck { - -// Assume: -// 1. src_desc and dst_desc are not known at compile-time -// 2. SrcBuffer and DstBuffer are DynamicBuffer -// 3. src_slice_origin and dst_slice_origin are not known at compile-time, -// 4. Use thread buffer -template // control whether to move back dst coordinate after each - // RunWrite(), will be fused with MoveDstSliceWindow to - // save addr computation -struct ThreadwiseTensorSliceTransfer_v3r1 -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static constexpr index_t nDim = SliceLengths::Size(); - using Index = MultiIndex; - - using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - - using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); - using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); - - __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1(const SrcDesc& src_desc, - const Index& src_slice_origin, - const DstDesc& dst_desc, - const Index& dst_slice_origin) - : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), - dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) - { - // TODO: fix this - static_assert(is_same::value, - "wrong! current implementation assume SrcData and DstData are same type"); - - static_for<0, nDim, 1>{}([](auto i) { - static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0 && - SliceLengths::At(i) % DstVectorTensorLengths::At(i) == 0, - "wrong!"); - }); - } - - __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) - { - src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); - } - - __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) - { - dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); - } - - template - __device__ void - RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) - { - static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, - "wrong!"); - - static_assert( - is_same, remove_cvref_t>::value, - "wrong! SrcBuffer and SrcData data type are inconsistent"); - - // tensor descriptor for src_vector - constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{}; - - constexpr auto src_vector_tensor_strides = container_reorder_given_old2new( - container_reverse_exclusive_scan( - container_reorder_given_new2old(src_vector_tensor_lengths, - SrcVectorTensorContiguousDimOrder{}), - math::multiplies{}, - I1), - SrcVectorTensorContiguousDimOrder{}); - - constexpr auto src_vector_desc = - make_naive_tensor_descriptor(sequence_to_tuple_of_number(src_vector_tensor_lengths), - sequence_to_tuple_of_number(src_vector_tensor_strides)); - - // access order and lengths - constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths; - - constexpr auto src_dim_access_order = SrcDimAccessOrder{}; - - constexpr auto ordered_src_access_lengths = - container_reorder_given_new2old(src_access_lengths, src_dim_access_order); - - // make forward steps - const auto src_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? src_vector_tensor_lengths[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, forward_step_idx, src_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto src_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -src_vector_tensor_lengths[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, backward_step_idx, src_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_src_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_src_access_idx[I0]; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i] - : ordered_src_access_lengths[i] - 1 - - ordered_src_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * - src_vector_tensor_lengths; - }(); - - vector_type_maker_t src_vector; - - using src_vector_t = typename decltype(src_vector)::type; - - const bool is_src_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); - - // copy data from src_buf to src_vector - src_vector.template AsType()(I0) = - src_buf.template Get(src_coord_.GetOffset(), is_src_valid); - - // copy data from src_vector to buffer_ - static_ford{}([&](auto src_vector_idx_) { - constexpr auto src_vector_idx = to_multi_index(src_vector_idx_); - - constexpr index_t src_vector_offset = - src_vector_desc.CalculateOffset(src_vector_idx); - - constexpr index_t buffer_offset = - buffer_desc_.CalculateOffset(src_data_idx + src_vector_idx); - - buffer_(Number{}) = - src_vector.template AsType()[Number{}]; - }); - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= - ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); - } - } - }); - }); - - // move src coordinate back to slice origin (or not) - if constexpr(SrcResetCoordinateAfterRun) - { - const auto src_reset_step = - make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); - - move_tensor_coordinate(src_desc, src_coord_, src_reset_step); - } - } - - template - __device__ void - RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) - { - static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Global or - DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, - "wrong!"); - - static_assert( - is_same, remove_cvref_t>::value, - "wrong! SrcBuffer or DstBuffer data type is wrong"); - - // tensor descriptor for dst_vector - constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{}; - - constexpr auto dst_vector_tensor_strides = container_reorder_given_old2new( - container_reverse_exclusive_scan( - container_reorder_given_new2old(dst_vector_tensor_lengths, - DstVectorTensorContiguousDimOrder{}), - math::multiplies{}, - I1), - DstVectorTensorContiguousDimOrder{}); - - constexpr auto dst_vector_desc = - make_naive_tensor_descriptor(sequence_to_tuple_of_number(dst_vector_tensor_lengths), - sequence_to_tuple_of_number(dst_vector_tensor_strides)); - - // dst access order and lengths - constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths; - - constexpr auto dst_dim_access_order = DstDimAccessOrder{}; - - constexpr auto ordered_dst_access_lengths = - container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); - - // make forward steps - const auto dst_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? dst_vector_tensor_lengths[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, forward_step_idx, dst_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto dst_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -dst_vector_tensor_lengths[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, backward_step_idx, dst_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_dst_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_dst_access_idx[I0]; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i] - : ordered_dst_access_lengths[i] - 1 - - ordered_dst_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * - dst_vector_tensor_lengths; - }(); - - vector_type_maker_t dst_vector; - - // copy data from buffer_ to dst_vector (also cast from SrcData to DstData) - static_ford{}([&](auto dst_vector_idx_) { - constexpr auto dst_vector_idx = to_multi_index(dst_vector_idx_); - - constexpr index_t buffer_offset = - buffer_desc_.CalculateOffset(dst_data_idx + dst_vector_idx); - - constexpr index_t dst_vector_offset = - dst_vector_desc.CalculateOffset(dst_vector_idx); - - dst_vector.template AsType()(Number{}) = - type_convert{}(buffer_[Number{}]); - }); - - using dst_vector_t = typename decltype(dst_vector)::type; - - // copy data from dst_vector to dst_buf - const bool is_dst_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); - - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= - ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); - } - } - }); - }); - - // move dst coordinate back to slice origin (or not) - if constexpr(DstResetCoordinateAfterRun) - { - const auto dst_reset_step = - make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); - - move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); - } - } - - template - __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) - { - constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto src_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - RunRead(src_desc, src_buf, src_step_hacks); - } - - template - __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) - { - constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto dst_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - RunWrite(dst_desc, dst_buf, dst_step_hacks); - } - - __device__ static constexpr auto GetSrcCoordinateResetStep() - { - constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{}; - - constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths; - - constexpr auto src_dim_access_order = SrcDimAccessOrder{}; - - constexpr auto ordered_src_access_lengths = - container_reorder_given_new2old(src_access_lengths, src_dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_src_access_lengths[I0] - 1; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index after last iteration in RunRead(), if it has not being reset by - // RunRead() - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * - src_vector_tensor_lengths; - }(); - - // - constexpr auto reset_src_data_step = [&]() { - Index reset_src_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); - - return reset_src_data_step_; - }(); - - return reset_src_data_step; - } - - __device__ static constexpr auto GetDstCoordinateResetStep() - { - constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{}; - - constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths; - - constexpr auto dst_dim_access_order = DstDimAccessOrder{}; - - constexpr auto ordered_dst_access_lengths = - container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_dst_access_lengths[I0] - 1; - - static_for<0, i, 1>{}([&](auto j) { - tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index after last iteration in RunWrite(), if it has not being reset by - // RunWrite() - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * - dst_vector_tensor_lengths; - }(); - - // - constexpr auto reset_dst_data_step = [&]() { - Index reset_dst_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); - - return reset_dst_data_step_; - }(); - - return reset_dst_data_step; - } - - // src_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& src_slice_origin_step_idx) - { - // if src coord was not reset by RunRead(), then need to adjust the step here - const auto adjusted_step_idx = - SrcResetCoordinateAfterRun ? src_slice_origin_step_idx - : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); - - move_tensor_coordinate(src_desc, src_coord_, adjusted_step); - } - - // src_slice_origin_step_idx need to be known at compile-time, for performance reason - template - __device__ void - MoveSrcSliceWindow(const SrcDesc& src_desc, - const Index& src_slice_origin_step_idx, - const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) - { - // if src coord was not reset by RunRead(), then need to adjust the step here - const auto adjusted_step_idx = - SrcResetCoordinateAfterRun ? src_slice_origin_step_idx - : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step( - src_desc, adjusted_step_idx, src_move_slice_window_step_hack); - - move_tensor_coordinate(src_desc, src_coord_, adjusted_step); - } - // dst_slice_origin_step_idx need to be known at compile-time, for performance reason - __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, - const Index& dst_slice_origin_step_idx) - { - // if dst coord was not reset by RunWrite(), then need to adjust the step here - const auto adjusted_step_idx = - DstResetCoordinateAfterRun ? dst_slice_origin_step_idx - : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); - - // is it OK to construct a new step every time? - const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); - - move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); - } - - private: - static constexpr auto buffer_desc_ = - make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); - - static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); - - StaticBuffer buffer_; - - SrcCoord src_coord_; - DstCoord dst_coord_; -}; - -// Assume: -// 1. src: -// 1. SrcDesc is known at compile-time -// 2. SrcBuffer is DynamicBuffer -// 3. src_ref_idx is known at run-time -// 4. SrcRefToOriginDisplacement is known at compile-time -// 5. use #-step -// 2. dst: -// 1. DstDesc is known at compile-time -// 2. DstBuffer is StaticBuffer -// 3. DstOriginIdx is known at compile-time -// 4. use direct address calculation -// 3. vector access on src -template ::type = false> -struct ThreadwiseTensorSliceTransfer_v4r1 -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - static constexpr index_t nDim = SliceLengths::Size(); - - using Index = MultiIndex; - - using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - - using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); - - __device__ constexpr ThreadwiseTensorSliceTransfer_v4r1(const Index& src_ref_idx) - : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx)) - { - static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), - "wrong! SrcDesc and DstDesc need to known at compile-time"); - - static_for<0, nDim, 1>{}([](auto i) { - static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0, "wrong!"); - }); - } - - template - __device__ void Run(const SrcDesc&, - const SrcRefToOriginDisplacement&, - const SrcBuffer& src_buf, - const DstDesc&, - const DstOriginIdx&, - DstBuffer& dst_buf) const - { - static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), - "wrong! SrcDesc and DstDesc need to known at compile-time"); - - static_assert( - is_same, remove_cvref_t>::value && - is_same, remove_cvref_t>::value, - "wrong! SrcBuffer or DstBuffer data type is wrong"); - - static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); - - static_assert(is_known_at_compile_time>::value && - is_known_at_compile_time>::value, - "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " - "at compile-time"); - - // SrcDesc and DstDesc are known at compile-time - constexpr auto src_desc = remove_cvref_t{}; - constexpr auto dst_desc = remove_cvref_t{}; - - // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time - constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{}); - constexpr auto dst_origin_idx = to_multi_index(DstOriginIdx{}); - - // tensor descriptor for src_vector - constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{}; - - constexpr auto src_vector_tensor_strides = container_reorder_given_old2new( - container_reverse_exclusive_scan( - container_reorder_given_new2old(src_vector_tensor_lengths, - SrcVectorTensorContiguousDimOrder{}), - math::multiplies{}, - I1), - SrcVectorTensorContiguousDimOrder{}); - - constexpr auto src_vector_desc = - make_naive_tensor_descriptor(sequence_to_tuple_of_number(src_vector_tensor_lengths), - sequence_to_tuple_of_number(src_vector_tensor_strides)); - - // access order and lengths - constexpr auto access_lengths = SliceLengths{} / src_vector_tensor_lengths; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - static_ford{}([&](auto ordered_access_idx) { - // position in slice window - constexpr auto data_to_origin_disp_idx = - ordered_access_idx.ReorderGivenOld2New(dim_access_order) * - src_vector_tensor_lengths; - - // src coordinate at starting point of src_vector - constexpr auto src_ref_to_data_disp_idx = - src_ref_to_origin_disp_idx + data_to_origin_disp_idx; - - constexpr auto src_ref_to_data_disp_coord_step = - make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx); - - auto src_data_coord = src_ref_coord_; - - move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step); - - vector_type_maker_t src_vector; - - using src_vector_t = typename decltype(src_vector)::type; - - const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid( - src_desc, src_data_coord); - - // copy data from src_buf into src_vector - src_vector.template AsType()(I0) = - src_buf.template Get(src_data_coord.GetOffset(), is_src_valid); - - // copy data from src_vector into dst_buf (also cast from SrcData to DstData) - static_ford{}([&](auto src_vector_idx_) { - constexpr auto src_vector_idx = to_multi_index(src_vector_idx_); - - constexpr index_t src_vector_offset = - src_vector_desc.CalculateOffset(src_vector_idx); - - constexpr index_t dst_offset = dst_desc.CalculateOffset( - dst_origin_idx + data_to_origin_disp_idx + src_vector_idx); - - dst_buf(Number{}) = type_convert{}( - src_vector.template AsType()[Number{}]); - }); - }); - } - - template - __device__ void MoveSrcSliceWindow(const SrcDesc&, - const SrcSliceMoveStepIdx& src_slice_move_step_idx) - { - constexpr auto src_desc = SrcDesc{}; - - const auto src_slice_move_step_iter = - make_tensor_coordinate_step(src_desc, to_multi_index(src_slice_move_step_idx)); - - move_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter); - } - - private: - SrcCoord src_ref_coord_; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/tensor_operation/xdlops_gemm.hpp b/composable_kernel/include/tensor_operation/xdlops_gemm.hpp deleted file mode 100644 index 10633f8f328787e00565677fb73ef7804d9e23d1..0000000000000000000000000000000000000000 --- a/composable_kernel/include/tensor_operation/xdlops_gemm.hpp +++ /dev/null @@ -1,783 +0,0 @@ -#ifndef CK_XDLOPS_GEMM_HPP -#define CK_XDLOPS_GEMM_HPP - -#include "common_header.hpp" -#include "math.hpp" -#include "amd_xdlops.hpp" - -namespace ck { - -enum struct MfmaInstr -{ - mfma_f32_32x32x1xf32 = 0, - mfma_f32_16x16x1xf32, - mfma_f32_4x4x1xf32, - mfma_f32_32x32x2xf32, // k reduction - mfma_f32_16x16x4xf32, // k reduction - mfma_f32_32x32x4f16, - mfma_f32_16x16x4f16, - mfma_f32_4x4x4f16, - mfma_f32_32x32x8f16, // k reduction - mfma_f32_16x16x16f16, // k reduction - mfma_f32_32x32x2bf16, - mfma_f32_16x16x2bf16, - mfma_f32_4x4x2bf16, - mfma_f32_32x32x4bf16, // k reduction - mfma_f32_16x16x8bf16, // k reduction -}; - -template -struct mfma_type; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 4; - static constexpr index_t num_regs_per_blk = 16; - static constexpr index_t num_threads_per_blk = 32; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 2; - static constexpr index_t num_output_blks = 2; - static constexpr index_t m_per_blk = 32; - static constexpr index_t n_per_blk = 32; - static constexpr index_t k_per_blk = 1; - static constexpr bool is_k_reduction = false; - - template - __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const - { - intrin_mfma_f32_32x32x1f32::Run(a, b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 4; - static constexpr index_t num_regs_per_blk = 16; - static constexpr index_t num_threads_per_blk = 32; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 2; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 32; - static constexpr index_t n_per_blk = 32; - static constexpr index_t k_per_blk = 1; - static constexpr bool is_k_reduction = true; - - template - __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const - { - intrin_mfma_f32_32x32x2f32::Run(a, b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; - static constexpr index_t num_threads_per_blk = 16; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 4; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 16; - static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 1; - static constexpr bool is_k_reduction = true; - - template - __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const - { - intrin_mfma_f32_16x16x4f32::Run(a, b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; - static constexpr index_t num_threads_per_blk = 16; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 4; - static constexpr index_t num_output_blks = 4; - static constexpr index_t m_per_blk = 16; - static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 1; - static constexpr bool is_k_reduction = false; - - template - __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const - { - intrin_mfma_f32_16x16x1f32::Run(a, b, reg_c); - } -}; - -// treat 4x4x1 as a single-blk 4x64 mfma -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; - static constexpr index_t num_threads_per_blk = 64; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 1; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 4; - static constexpr index_t n_per_blk = 64; - static constexpr index_t k_per_blk = 1; - static constexpr bool is_k_reduction = false; - - template - __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const - { - intrin_mfma_f32_4x4x1f32::Run(a, b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 4; - static constexpr index_t num_regs_per_blk = 16; - static constexpr index_t num_threads_per_blk = 32; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 2; - static constexpr index_t num_output_blks = 2; - static constexpr index_t m_per_blk = 32; - static constexpr index_t n_per_blk = 32; - static constexpr index_t k_per_blk = 4; - static constexpr bool is_k_reduction = false; - - template - __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const - { - intrin_mfma_f32_32x32x4f16::Run(a, b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 4; - static constexpr index_t num_regs_per_blk = 16; - static constexpr index_t num_threads_per_blk = 32; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 2; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 32; - static constexpr index_t n_per_blk = 32; - static constexpr index_t k_per_blk = 4; - static constexpr bool is_k_reduction = true; - - template - __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const - { - intrin_mfma_f32_32x32x8f16::Run(a, b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; - static constexpr index_t num_threads_per_blk = 16; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 4; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 16; - static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 4; - static constexpr bool is_k_reduction = true; - - template - __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const - { - intrin_mfma_f32_16x16x16f16::Run(a, b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; - static constexpr index_t num_threads_per_blk = 16; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 4; - static constexpr index_t num_output_blks = 4; - static constexpr index_t m_per_blk = 16; - static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 4; - static constexpr bool is_k_reduction = false; - - template - __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const - { - intrin_mfma_f32_16x16x4f16::Run(a, b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; - static constexpr index_t num_threads_per_blk = 64; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 1; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 4; - static constexpr index_t n_per_blk = 64; - static constexpr index_t k_per_blk = 4; - static constexpr bool is_k_reduction = false; - - template - __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const - { - intrin_mfma_f32_4x4x4f16::Run(a, b, reg_c); - } -}; - -#if 0 -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 4; - static constexpr index_t num_regs_per_blk = 16; - static constexpr index_t num_threads_per_blk = 32; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 2; - static constexpr index_t num_output_blks = 2; - static constexpr index_t m_per_blk = 32; - static constexpr index_t n_per_blk = 32; - static constexpr index_t k_per_blk = 2; - static constexpr bool is_k_reduction = false; - - template - __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const - { - const auto p_a = c_style_pointer_cast(a); - const auto p_b = c_style_pointer_cast(b); - - return intrin_mfma_f32_32x32x2bf16::run( - p_a, p_b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 4; - static constexpr index_t num_regs_per_blk = 16; - static constexpr index_t num_threads_per_blk = 32; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 2; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 32; - static constexpr index_t n_per_blk = 32; - static constexpr index_t k_per_blk = 2; - static constexpr bool is_k_reduction = true; - - template - __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const - { - const auto p_a = c_style_pointer_cast(a); - const auto p_b = c_style_pointer_cast(b); - - return intrin_mfma_f32_32x32x4bf16(p_a, p_b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; - static constexpr index_t num_threads_per_blk = 16; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 4; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 16; - static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 2; - static constexpr bool is_k_reduction = true; - - template - __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const - { - const auto p_a = c_style_pointer_cast(a); - const auto p_b = c_style_pointer_cast(b); - - return intrin_mfma_f32_16x16x8bf16(p_a, p_b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; - static constexpr index_t num_threads_per_blk = 16; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 4; - static constexpr index_t num_output_blks = 4; - static constexpr index_t m_per_blk = 16; - static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 2; - static constexpr bool is_k_reduction = false; - - template - __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const - { - const auto p_a = c_style_pointer_cast(a); - const auto p_b = c_style_pointer_cast(b); - - return intrin_mfma_f32_16x16x2bf16(p_a, p_b, reg_c); - } -}; - -template <> -struct mfma_type -{ - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; - static constexpr index_t num_threads_per_blk = 64; - static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 1; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 4; - static constexpr index_t n_per_blk = 64; - static constexpr index_t k_per_blk = 2; - static constexpr bool is_k_reduction = false; - - template - __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const - { - const auto p_a = c_style_pointer_cast(a); - const auto p_b = c_style_pointer_cast(b); - - return intrin_mfma_f32_4x4x2bf16::run(p_a, p_b, reg_c); - } -}; -#endif - -template -struct MfmaSelector -{ - template - static constexpr auto GetMfma(); - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_32x32x1xf32; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_32x32x1xf32; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_16x16x1xf32; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_4x4x1xf32; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_4x4x1xf32; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_32x32x2xf32; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_16x16x4xf32; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_32x32x4f16; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_32x32x4f16; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_32x32x8f16; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_16x16x16f16; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_16x16x4f16; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_4x4x4f16; - } - - template <> - static constexpr auto GetMfma() - { - return MfmaInstr::mfma_f32_4x4x4f16; - } - -#if 0 - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } - - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } - - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } - - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } - - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } - - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } - - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } - - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } - - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } - - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } - - template <> - static constexpr auto GetMfma() - { - return xdlops_info{}; - } -#endif - - static constexpr auto selected_mfma = mfma_type()>{}; - - __host__ __device__ static constexpr void mfma_check() - { - static_assert(selected_mfma.group_size * selected_mfma.num_groups_per_blk == - selected_mfma.num_regs_per_blk, - "wrong! num_regs_per_blk"); - - static_assert(selected_mfma.num_threads_per_blk == selected_mfma.n_per_blk, - "n_per_blk != num_threads_per_blk"); - - static_assert(selected_mfma.num_regs_per_blk * selected_mfma.num_input_blks == - selected_mfma.m_per_blk, - "m_per_blk != num_input_blks * num_regs_per_blk"); - - static_assert(selected_mfma.num_output_blks == selected_mfma.num_input_blks || - selected_mfma.num_output_blks == 1, - "incorrect num_output_blks"); - - static_assert(selected_mfma.num_regs_per_blk * selected_mfma.wave_size == - selected_mfma.m_per_blk * selected_mfma.n_per_blk, - "num_regs_per_blk incorrect"); - - static_assert(selected_mfma.is_k_reduction || - (selected_mfma.num_input_blks == selected_mfma.num_output_blks), - "is_k_reduction wrong!"); - } - - __host__ __device__ constexpr MfmaSelector() { mfma_check(); } - - static constexpr bool IsABroadcast() - { - static_assert(NPerXdlops >= MPerXdlops, "only support ABroadcast"); - return true; - } - - static constexpr index_t GetKPerXdlops() - { - return (selected_mfma.is_k_reduction ? selected_mfma.num_input_blks : 1) * - selected_mfma.k_per_blk; - } - - static constexpr index_t GetKPerThread() { return selected_mfma.k_per_blk; } -}; - -template -struct XdlopsGemm -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr auto I4 = Number<4>{}; - static constexpr auto I5 = Number<5>{}; - - using CIndex = MultiIndex<2>; - - __device__ static constexpr index_t GetNumBlks() { return mfma_instr.num_output_blks; } - - __device__ static constexpr index_t GetNumXdlops() - { - return MPerXdlops * NPerXdlops / - (mfma_instr.m_per_blk * mfma_instr.n_per_blk * mfma_instr.num_output_blks); - } - - __host__ __device__ constexpr XdlopsGemm() - { - static_assert(NPerXdlops == 4 || NPerXdlops == 8 || NPerXdlops == 16 || NPerXdlops == 32 || - NPerXdlops == 64, - "Only support GemmNPerXdlops == 4, 8, 16, 32 or 64 for xdlops"); - - static_assert(MPerXdlops == 4 || MPerXdlops == 8 || MPerXdlops == 16 || MPerXdlops == 32 || - MPerXdlops == 64, - "Only support GemmMPerXdlops == 4, 8, 16, 32 or 64 for xdlops"); - - static_assert(KPack % mfma_instr.k_per_blk == 0, "KPack cannot be divided by k_per_blk"); - } - - template - __host__ __device__ static constexpr auto - MakeCM0N0M1N1M2M3M4N2Descriptor(const CM0N0M1N1M2N2Desc& c_m0_n0_m1_n1_m2_n2_desc) - { - const auto M0 = c_m0_n0_m1_n1_m2_n2_desc.GetLength(I0); - const auto N0 = c_m0_n0_m1_n1_m2_n2_desc.GetLength(I1); - const auto M1 = c_m0_n0_m1_n1_m2_n2_desc.GetLength(I2); - const auto N1 = c_m0_n0_m1_n1_m2_n2_desc.GetLength(I3); - - return transform_tensor_descriptor( - c_m0_n0_m1_n1_m2_n2_desc, - make_tuple(make_pass_through_transform(M0), - make_pass_through_transform(N0), - make_pass_through_transform(M1), - make_pass_through_transform(N1), - make_unmerge_transform(make_tuple(mfma_instr.num_groups_per_blk, - mfma_instr.num_input_blks, - mfma_instr.group_size)), - make_pass_through_transform(mfma_instr.num_threads_per_blk)), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4, 5, 6>{}, - Sequence<7>{})); - } - - __device__ static constexpr index_t GetRegSizePerXdlops() - { - return MPerXdlops * NPerXdlops / mfma_instr.wave_size; - } - - template - __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const - { - static_assert(is_same::value || is_same::value || - is_same::value, - "base base_type must be float, half, ushort!"); - - static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) { - mfma_instr.template run(p_a_wave[k], p_b_wave[k], p_c_thread); - }); - } - - __device__ static auto GetLaneId() { return get_thread_local_1d_id() % mfma_instr.wave_size; } - - __device__ static auto GetBlkIdx() - { - const auto laneId = GetLaneId(); - - constexpr auto threadidx_to_blk_idx_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform( - make_tuple(1, mfma_instr.num_input_blks, mfma_instr.num_threads_per_blk))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - const auto blk_idx = - threadidx_to_blk_idx_adaptor.CalculateBottomIndex(make_multi_index(laneId)); - - const auto blk_id = blk_idx[I1]; - const auto blk_td = blk_idx[I2]; - - return make_tuple(blk_id, blk_td); - } - - __host__ __device__ static auto CalculateAThreadOriginDataIndex() - { - const auto laneId = GetLaneId(); - const auto blk_idx = GetBlkIdx(); - - const auto blk_id = blk_idx[I0]; - const auto blk_td = blk_idx[I1]; - - if constexpr(mfma_instr.is_k_reduction) - { - return make_tuple(blk_id, blk_td); - } - else - { - return make_tuple(0, laneId); - } - } - - __host__ __device__ static auto CalculateBThreadOriginDataIndex() - { - const auto laneId = GetLaneId(); - const auto blk_idx = GetBlkIdx(); - - const auto blk_id = blk_idx[I0]; - const auto blk_td = blk_idx[I1]; - - if constexpr(mfma_instr.is_k_reduction) - { - return make_tuple(blk_id, blk_td); - } - else - { - return make_tuple(0, laneId); - } - } - - __device__ static CIndex GetBeginOfThreadBlk(index_t xdlops_i, index_t blk_i) - { - const auto blk_idx = GetBlkIdx(); - - const auto blk_id = blk_idx[I0]; - const auto blk_td = blk_idx[I1]; - - index_t n_offset = blk_i * mfma_instr.n_per_blk + blk_td; - index_t m_offset = xdlops_i * mfma_instr.m_per_blk + blk_id * mfma_instr.group_size; - - return CIndex{m_offset, n_offset}; - } - - static constexpr auto mfma = MfmaSelector{}; - - static constexpr auto mfma_instr = mfma.selected_mfma; - - static constexpr auto KPerXdlops = mfma.GetKPerXdlops(); - static constexpr auto K1PerXdlops = mfma.GetKPerThread(); - static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops; - - __host__ __device__ static constexpr auto GetCM0M1M2NThreadBlkLengths() - { - return make_tuple( - Number{}, I1, Number{}, I1); - } -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/amd_address_space.hpp b/composable_kernel/include/utility/amd_address_space.hpp deleted file mode 100644 index 24c95b27af07b33d96741adf57de256a6ee7a008..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/amd_address_space.hpp +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef CK_AMD_ADDRESS_SPACE_HPP -#define CK_AMD_ADDRESS_SPACE_HPP - -#include "config.hpp" -#include "c_style_pointer_cast.hpp" - -// Address Space for AMDGCN -// https://llvm.org/docs/AMDGPUUsage.html#address-space - -namespace ck { - -enum AddressSpaceEnum_t -{ - Generic, - Global, - Lds, - Sgpr, - Vgpr, -}; - -template -__device__ T* cast_pointer_to_generic_address_space(T CONSTANT* p) -{ - // cast a pointer in "Constant" address space (4) to "Generic" address space (0) - // only c-style pointer cast seems be able to be compiled -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" - return (T*)p; // NOLINT(old-style-cast) -#pragma clang diagnostic pop -} - -template -__host__ __device__ T CONSTANT* cast_pointer_to_constant_address_space(T* p) -{ - // cast a pointer in "Generic" address space (0) to "Constant" address space (4) - // only c-style pointer cast seems be able to be compiled -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" - return (T CONSTANT*)p; // NOLINT(old-style-cast) -#pragma clang diagnostic pop -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/amd_buffer_addressing.hpp b/composable_kernel/include/utility/amd_buffer_addressing.hpp deleted file mode 100644 index 3df53bda443a1c233d19ee72776933bbfd6e8069..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/amd_buffer_addressing.hpp +++ /dev/null @@ -1,897 +0,0 @@ -#ifndef CK_AMD_BUFFER_ADDRESSING_HPP -#define CK_AMD_BUFFER_ADDRESSING_HPP - -#include "data_type.hpp" - -namespace ck { - -template -union BufferResource -{ - // 128 bit SGPRs to supply buffer resource in buffer instructions - // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions - int32x4_t content; - StaticallyIndexedArray address; - StaticallyIndexedArray range; - StaticallyIndexedArray config; -}; - -template -__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size) -{ - BufferResource wave_buffer_resource; - - // wavewise base address (64 bit) - wave_buffer_resource.address(Number<0>{}) = const_cast*>(p_wave); - // wavewise range (32 bit) - wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T); - // wavewise setting (32 bit) - wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD; - - return wave_buffer_resource.content; -} - -// load -__device__ int8_t -llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8"); - -__device__ int8x2_t -llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8"); - -__device__ int8x4_t -llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8"); - -__device__ int16_t -llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32"); -__device__ int32_t -llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32"); - -__device__ int32x2_t -llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32"); - -__device__ int32x4_t -llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32"); -// half -__device__ half_t -llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16"); - -__device__ half2_t -llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16"); - -__device__ half4_t -llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16"); - -// float -__device__ float -llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32"); - -__device__ float2_t -llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32"); - -__device__ float4_t -llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32"); - -// store -__device__ void -llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8"); - -__device__ void -llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8"); - -__device__ void -llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8"); - -__device__ void -llvm_amdgcn_raw_buffer_store_i16(int16_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16"); - -__device__ void -llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32"); - -__device__ void -llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32"); - -__device__ void -llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32"); - -// half -__device__ void -llvm_amdgcn_raw_buffer_store_fp16(half_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16"); - -__device__ void -llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16"); - -__device__ void -llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16"); -// float -__device__ void -llvm_amdgcn_raw_buffer_store_fp32(float vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32"); - -__device__ void -llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32"); - -__device__ void -llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32"); -// atomic add -// int -__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( - int32_t vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32"); - -// float -__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32( - float vdata, - int32x4_t rsrc, - index_t voffset, - index_t soffset, - index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32"); - -template -__device__ typename vector_type::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, - index_t src_thread_addr_offset, - index_t src_wave_addr_offset) -{ - static_assert( - (is_same::value && (N == 1 || N == 2 || N == 4)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), - "wrong! not implemented"); - - if constexpr(is_same::value) - { - // use fp32 load to mimic fp64 load - if constexpr(N == 1) - { - const float2_t tmp = llvm_amdgcn_raw_buffer_load_fp32x2( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - return as_type(tmp); - } - else if constexpr(N == 2) - { - const float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - return as_type(tmp); - } - else if constexpr(N == 4) - { - const float4_t f32_0 = llvm_amdgcn_raw_buffer_load_fp32x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - const float4_t f32_1 = - llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(float), - 0); - vector_type tmp; - - tmp.AsType()(Number<0>{}) = as_type(f32_0); - tmp.AsType()(Number<1>{}) = as_type(f32_1); - - return tmp.AsType()(Number<0>{}); - } - } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - return llvm_amdgcn_raw_buffer_load_fp32( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - } - else if constexpr(N == 2) - { - return llvm_amdgcn_raw_buffer_load_fp32x2( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - } - else if constexpr(N == 4) - { - return llvm_amdgcn_raw_buffer_load_fp32x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - } - else if constexpr(N == 8) - { - vector_type tmp; - - tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp32x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - tmp.AsType()(Number<1>{}) = - llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(float), - 0); - - return tmp.AsType()(Number<0>{}); - } - } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - return llvm_amdgcn_raw_buffer_load_fp16( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - } - else if constexpr(N == 2) - { - return llvm_amdgcn_raw_buffer_load_fp16x2( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - } - else if constexpr(N == 4) - { - return llvm_amdgcn_raw_buffer_load_fp16x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - } - else if constexpr(N == 8) - { - // use fp32 load to mimic fp16 load - float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - return as_type(tmp); - } - } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - return llvm_amdgcn_raw_buffer_load_i32( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - } - else if constexpr(N == 2) - { - return llvm_amdgcn_raw_buffer_load_i32x2( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - } - else if constexpr(N == 4) - { - return llvm_amdgcn_raw_buffer_load_i32x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - } - else if constexpr(N == 8) - { - vector_type tmp; - - tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i32x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - tmp.AsType()(Number<1>{}) = - llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(int32_t), - 0); - return tmp.AsType()(Number<0>{}); - } - } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - return llvm_amdgcn_raw_buffer_load_i8( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - } - else if constexpr(N == 2) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE - return llvm_amdgcn_raw_buffer_load_i8x2( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); -#else - int16_t tmp = llvm_amdgcn_raw_buffer_load_i16( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - return as_type(tmp); -#endif - } - else if constexpr(N == 4) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE - return llvm_amdgcn_raw_buffer_load_i8x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); -#else - int32_t tmp = llvm_amdgcn_raw_buffer_load_i32( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - return as_type(tmp); -#endif - } - else if constexpr(N == 8) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE - vector_type tmp; - - tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - tmp.AsType()(Number<1>{}) = - llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(int8_t), - 0); - - return tmp.AsType()(Number<0>{}); -#else - int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - return as_type(tmp); -#endif - } - else if constexpr(N == 16) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE - vector_type tmp; - - tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - tmp.AsType()(Number<1>{}) = - llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(int8_t), - 0); - - tmp.AsType()(Number<2>{}) = - llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 8 * sizeof(int8_t), - 0); - - tmp.AsType()(Number<3>{}) = - llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 12 * sizeof(int8_t), - 0); - - return tmp.AsType()(Number<0>{}); -#else - int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - return as_type(tmp); -#endif - } - } -} - -template -__device__ void amd_buffer_store_impl(const typename vector_type::type src_thread_data, - int32x4_t dst_wave_buffer_resource, - index_t dst_thread_addr_offset, - index_t dst_wave_addr_offset) -{ - static_assert( - (is_same::value && (N == 1 || N == 2)) || - (is_same::value && (N == 1 || N == 2 || N == 4)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (is_same::value && (N == 1 || N == 2 || N == 4)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), - "wrong! not implemented"); - - if constexpr(is_same::value) - { - // use fp32 store to mimic fp64 store - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_fp32x2(as_type(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_store_fp32x4(as_type(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_fp32(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_store_fp32x2(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_store_fp32x4(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_fp16(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 8) - { - vector_type tmp{src_thread_data}; - - llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 4 * sizeof(half_t), - 0); - } - } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_i32(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_store_i32x2(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_store_i32x4(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_i8(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE - llvm_amdgcn_raw_buffer_store_i8x2(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); -#else - llvm_amdgcn_raw_buffer_store_i16(as_type(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); -#endif - } - else if constexpr(N == 4) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE - llvm_amdgcn_raw_buffer_store_i8x4(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); -#else - llvm_amdgcn_raw_buffer_store_i32(as_type(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); -#endif - } - else if constexpr(N == 8) - { - llvm_amdgcn_raw_buffer_store_i32x2(as_type(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 16) - { - llvm_amdgcn_raw_buffer_store_i32x4(as_type(src_thread_data), - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - } -} - -template -__device__ void amd_buffer_atomic_add_impl(const typename vector_type::type src_thread_data, - int32x4_t dst_wave_buffer_resource, - index_t dst_thread_addr_offset, - index_t dst_wave_addr_offset) -{ - static_assert((is_same::value && (N == 1 || N == 2 || N == 4)) || - (is_same::value && (N == 1 || N == 2 || N == 4)), - "wrong! not implemented"); - - if constexpr(is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_atomic_add_fp32(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { - vector_type tmp{src_thread_data}; - - llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(float), - 0); - } - else if constexpr(N == 4) - { - vector_type tmp{src_thread_data}; - - llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(float), - 0); - - llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<2>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 2 * sizeof(float), - 0); - - llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<3>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 3 * sizeof(float), - 0); - } - } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_atomic_add_i32(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { - vector_type tmp{src_thread_data}; - - llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(int32_t), - 0); - } - else if constexpr(N == 4) - { - vector_type tmp{src_thread_data}; - - llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + sizeof(int32_t), - 0); - - llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<2>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 2 * sizeof(int32_t), - 0); - - llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<3>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 3 * sizeof(int32_t), - 0); - } - } -} - -// buffer_load requires: -// 1) p_src_wave must point to global memory space -// 2) p_src_wave must be a wavewise pointer. -// It is user's responsibility to make sure that is true. -template -__device__ typename vector_type_maker::type::type -amd_buffer_load_invalid_element_return_return_zero(const T* p_src_wave, - index_t src_thread_element_offset, - bool src_thread_element_valid, - index_t src_element_space_size) -{ - const int32x4_t src_wave_buffer_resource = - make_wave_buffer_resource(p_src_wave, src_element_space_size); - - index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); - - using vector_t = typename vector_type_maker::type::type; - using scalar_t = typename scalar_type::type; - - constexpr index_t vector_size = scalar_type::vector_size; - -#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK - uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x7fffffff; - - return amd_buffer_load_impl( - src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0); -#else - vector_t tmp = amd_buffer_load_impl( - src_wave_buffer_resource, src_thread_addr_offset, 0); - - return src_thread_element_valid ? tmp : vector_t(0); -#endif -} - -// buffer_load requires: -// 1) p_src_wave must point to global memory space -// 2) p_src_wave must be a wavewise pointer. -// It is user's responsibility to make sure that is true. -template -__device__ typename vector_type_maker::type::type -amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave, - index_t src_thread_element_offset, - bool src_thread_element_valid, - index_t src_element_space_size, - T customized_value) -{ - const int32x4_t src_wave_buffer_resource = - make_wave_buffer_resource(p_src_wave, src_element_space_size); - - index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); - - using vector_t = typename vector_type_maker::type::type; - using scalar_t = typename scalar_type::type; - - constexpr index_t vector_size = scalar_type::vector_size; - - vector_t tmp = amd_buffer_load_impl( - src_wave_buffer_resource, src_thread_addr_offset, 0); - - return src_thread_element_valid ? tmp : vector_t(customized_value); -} - -// buffer_store requires: -// 1) p_dst_wave must point to global memory -// 2) p_dst_wave must be a wavewise pointer. -// It is user's responsibility to make sure that is true. -template -__device__ void amd_buffer_store(const typename vector_type_maker::type::type src_thread_data, - T* p_dst_wave, - const index_t dst_thread_element_offset, - const bool dst_thread_element_valid, - const index_t dst_element_space_size) -{ - const int32x4_t dst_wave_buffer_resource = - make_wave_buffer_resource(p_dst_wave, dst_element_space_size); - - index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); - - using vector_t = typename vector_type_maker::type::type; - using scalar_t = typename scalar_type::type; - constexpr index_t vector_size = scalar_type::vector_size; - -#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK - uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff; - - amd_buffer_store_impl( - src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); -#else - if(dst_thread_element_valid) - { - amd_buffer_store_impl( - src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); - } -#endif -} - -// buffer_atomic_add requires: -// 1) p_dst_wave must point to global memory -// 2) p_dst_wave must be a wavewise pointer. -// It is user's responsibility to make sure that is true. -template -__device__ void -amd_buffer_atomic_add(const typename vector_type_maker::type::type src_thread_data, - T* p_dst_wave, - const index_t dst_thread_element_offset, - const bool dst_thread_element_valid, - const index_t dst_element_space_size) -{ - const int32x4_t dst_wave_buffer_resource = - make_wave_buffer_resource(p_dst_wave, dst_element_space_size); - - index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); - - using vector_t = typename vector_type_maker::type::type; - using scalar_t = typename scalar_type::type; - constexpr index_t vector_size = scalar_type::vector_size; - -#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK - uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff; - - amd_buffer_atomic_add_impl( - src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); -#else - if(dst_thread_element_valid) - { - amd_buffer_atomic_add_impl( - src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); - } -#endif -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/amd_llvm_intrinsic.hpp b/composable_kernel/include/utility/amd_llvm_intrinsic.hpp deleted file mode 100644 index 841d48f81cb93d40182f377a26d8cfddd1abeb22..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/amd_llvm_intrinsic.hpp +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef CK_AMD_LLVM_INTRINSIC_HPP -#define CK_AMD_LLVM_INTRINSIC_HPP - -#include "data_type.hpp" - -namespace ck { - -__device__ int32_t llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane"); - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/amd_xdlops.hpp b/composable_kernel/include/utility/amd_xdlops.hpp deleted file mode 100644 index 083e47fbf1ebc9cf054edb9f0f454eb91c7896b5..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/amd_xdlops.hpp +++ /dev/null @@ -1,390 +0,0 @@ -#ifndef CK_AMD_XDLOPS_HPP -#define CK_AMD_XDLOPS_HPP - -#include "data_type.hpp" - -namespace ck { - -// A, B, C, cbsz, abid, blgp -extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x1f32( - float, float, float32_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x1f32"); - -extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x2f32( - float, float, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x2f32"); - -extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x4f32( - float, float, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x4f32"); - -extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x1f32( - float, float, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x1f32"); - -extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x1f32( - float, float, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x1f32"); - -extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x4f16( - half4_t, half4_t, float32_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x4f16"); - -extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x8f16( - half4_t, half4_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x8f16"); - -extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x16f16( - half4_t, half4_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x16f16"); - -extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x4f16( - half4_t, half4_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x4f16"); - -extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x4f16( - half4_t, half4_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x4f16"); - -extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x2bf16( - ushort2_t, ushort2_t, float32_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x2bf16"); - -extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x4bf16( - ushort2_t, ushort2_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x4bf16"); - -extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x8bf16( - ushort2_t, ushort2_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x8bf16"); - -extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x2bf16( - ushort2_t, ushort2_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x2bf16"); - -extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x2bf16( - ushort2_t, ushort2_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x2bf16"); - -template -struct intrin_mfma_f32_32x32x1f32; - -template <> -struct intrin_mfma_f32_32x32x1f32<64, 64> -{ - template - __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_32x32x1f32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 1, 0, 0); - reg_c.template AsType()(Number<1>{}) = llvm_intrin_amdgcn_mfma_f32_32x32x1f32( - reg_a, reg_b, reg_c.template AsType()[Number<1>{}], 1, 1, 0); - } -}; - -template <> -struct intrin_mfma_f32_32x32x1f32<32, 64> -{ - template - __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_32x32x1f32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 1, 0, 0); - } -}; - -template -struct intrin_mfma_f32_32x32x2f32; - -template <> -struct intrin_mfma_f32_32x32x2f32<32, 32> -{ - template - __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_32x32x2f32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); - } -}; - -template -struct intrin_mfma_f32_16x16x4f32; - -template <> -struct intrin_mfma_f32_16x16x4f32<16, 16> -{ - template - __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_16x16x4f32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); - } -}; - -template -struct intrin_mfma_f32_16x16x1f32; - -template <> -struct intrin_mfma_f32_16x16x1f32<16, 64> -{ - template - __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) - { - - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_16x16x1f32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 2, 0, 0); - } -}; - -template -struct intrin_mfma_f32_4x4x1f32; - -template <> -struct intrin_mfma_f32_4x4x1f32<4, 64> -{ - template - __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_4x4x1f32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 4, 0, 0); - } -}; - -template <> -struct intrin_mfma_f32_4x4x1f32<8, 64> -{ - template - __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_4x4x1f32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 4, 0, 0); - reg_c.template AsType()(Number<1>{}) = llvm_intrin_amdgcn_mfma_f32_4x4x1f32( - reg_a, reg_b, reg_c.template AsType()[Number<1>{}], 4, 1, 0); - } -}; - -template -struct intrin_mfma_f32_32x32x4f16; - -template <> -struct intrin_mfma_f32_32x32x4f16<64, 64> -{ - template - __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_32x32x4f16( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 1, 0, 0); - reg_c.template AsType()(Number<1>{}) = llvm_intrin_amdgcn_mfma_f32_32x32x4f16( - reg_a, reg_b, reg_c.template AsType()[Number<1>{}], 1, 1, 0); - } -}; - -template <> -struct intrin_mfma_f32_32x32x4f16<32, 64> -{ - template - __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_32x32x4f16( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 1, 0, 0); - } -}; - -template -struct intrin_mfma_f32_32x32x8f16; - -template <> -struct intrin_mfma_f32_32x32x8f16<32, 32> -{ - template - __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_32x32x8f16( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); - } -}; - -template -struct intrin_mfma_f32_16x16x16f16; - -template <> -struct intrin_mfma_f32_16x16x16f16<16, 16> -{ - template - __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_16x16x16f16( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); - } -}; - -template -struct intrin_mfma_f32_16x16x4f16; - -template <> -struct intrin_mfma_f32_16x16x4f16<16, 64> -{ - template - __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_16x16x4f16( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 2, 0, 0); - } -}; - -template -struct intrin_mfma_f32_4x4x4f16; - -template <> -struct intrin_mfma_f32_4x4x4f16<4, 64> -{ - template - __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_4x4x4f16( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 4, 0, 0); - } -}; - -template <> -struct intrin_mfma_f32_4x4x4f16<8, 64> -{ - template - __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = llvm_intrin_amdgcn_mfma_f32_4x4x4f16( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 4, 0, 0); - reg_c.template AsType()(Number<1>{}) = llvm_intrin_amdgcn_mfma_f32_4x4x4f16( - reg_a, reg_b, reg_c.template AsType()[Number<1>{}], 4, 1, 0); - } -}; - -#if 0 -template -struct intrin_mfma_f32_32x32x2bf16; - -template -struct intrin_mfma_f32_32x32x2bf16<128, 64, AStride, BStride> -{ - __device__ static c_vec32_4_t::VecType - run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_4_t::VecType reg_c) - { - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0); - reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 1, 1, 0); - - reg_c.s.z = - llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[AStride], reg_b[0], reg_c.s.z, 1, 0, 0); - reg_c.s.w = - llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[AStride], reg_b[0], reg_c.s.w, 1, 1, 0); - - return reg_c; - } -}; - -template -struct intrin_mfma_f32_32x32x2bf16<64, 128, AStride, BStride> -{ - __device__ static c_vec32_4_t::VecType - run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_4_t::VecType reg_c) - { - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0); - reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 1, 1, 0); - - reg_c.s.z = - llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[BStride], reg_c.s.z, 1, 0, 0); - reg_c.s.w = - llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[BStride], reg_c.s.w, 1, 1, 0); - - return reg_c; - } -}; - -template -struct intrin_mfma_f32_32x32x2bf16<64, 64, AStride, BStride> -{ - __device__ static c_vec32_2_t::VecType - run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_2_t::VecType reg_c) - { - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0); - reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 1, 1, 0); - - return reg_c; - } -}; - -template -struct intrin_mfma_f32_32x32x2bf16<64, 32, AStride, BStride> -{ - __device__ static c_vec32_1_t::VecType - run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_1_t::VecType reg_c) - { - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 1); - - return reg_c; - } -}; - -template -struct intrin_mfma_f32_32x32x2bf16<32, 64, AStride, BStride> -{ - __device__ static c_vec32_1_t::VecType - run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_1_t::VecType reg_c) - { - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0); - return reg_c; - } -}; - -__device__ c_vec16_1_t::VecType intrin_mfma_f32_32x32x4bf16(const ushort2_t* reg_a, - const ushort2_t* reg_b, - c_vec16_1_t::VecType reg_c) -{ - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x4bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 0); - return reg_c; -} - -__device__ c_vec4_1_t::VecType intrin_mfma_f32_16x16x8bf16(const ushort2_t* reg_a, - const ushort2_t* reg_b, - c_vec4_1_t::VecType reg_c) -{ - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_16x16x8bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 0); - return reg_c; -} - -template -__device__ c_vec16_1_t::VecType intrin_mfma_f32_16x16x2bf16(const ushort2_t* reg_a, - const ushort2_t* reg_b, - c_vec16_1_t::VecType reg_c); -template <> -__device__ c_vec16_1_t::VecType intrin_mfma_f32_16x16x2bf16<16, 64>(const ushort2_t* reg_a, - const ushort2_t* reg_b, - c_vec16_1_t::VecType reg_c) -{ - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_16x16x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 2, 0, 0); - return reg_c; -} - -template <> -__device__ c_vec16_1_t::VecType intrin_mfma_f32_16x16x2bf16<64, 16>(const ushort2_t* reg_a, - const ushort2_t* reg_b, - c_vec16_1_t::VecType reg_c) -{ - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_16x16x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 4); - return reg_c; -} - -template -struct intrin_mfma_f32_4x4x2bf16; - -template <> -struct intrin_mfma_f32_4x4x2bf16<4, 64> -{ - __device__ static c_vec4_1_t::VecType - run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec4_1_t::VecType reg_c) - { - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 4, 0, 0); - return reg_c; - } -}; - -template <> -struct intrin_mfma_f32_4x4x2bf16<8, 64> -{ - __device__ static c_vec4_2_t::VecType - run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec4_2_t::VecType reg_c) - { - reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 4, 0, 0); - reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 4, 1, 0); - return reg_c; - } -}; - -#endif - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp deleted file mode 100644 index 85c02a1b99d26a0ead02f802776cc065b9ffbc0b..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/common_header.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef CK_COMMON_HEADER_HPP -#define CK_COMMON_HEADER_HPP - -#include "config.hpp" -#include "array.hpp" -#include "container_helper.hpp" -#include "statically_indexed_array.hpp" -#include "container_element_picker.hpp" -#include "multi_index.hpp" -#include "data_type.hpp" -#include "data_type_enum.hpp" -#include "data_type_enum_helper.hpp" -#include "functional.hpp" -#include "functional2.hpp" -#include "functional3.hpp" -#include "functional4.hpp" -#include "enable_if.hpp" -#include "integral_constant.hpp" -#include "math.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "sequence_helper.hpp" -#include "synchronization.hpp" -#include "tuple.hpp" -#include "tuple_helper.hpp" -#include "type.hpp" -#include "magic_division.hpp" -#include "utility.hpp" -#include "c_style_pointer_cast.hpp" -#include "amd_address_space.hpp" -#include "amd_buffer_addressing.hpp" -#include "static_buffer.hpp" -#include "dynamic_buffer.hpp" - -#include "inner_product.hpp" - -// TODO: remove this -#if CK_USE_AMD_INLINE_ASM -#include "amd_inline_asm.hpp" -#endif - -#if CK_USE_AMD_XDLOPS -#include "amd_xdlops.hpp" -#endif - -#endif diff --git a/composable_kernel/include/utility/config.hpp b/composable_kernel/include/utility/config.hpp deleted file mode 100644 index 5ee4bb9c642fc2b015e13183ae6dab379cffbaed..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/config.hpp +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef CK_CONFIG_AMD_HPP -#define CK_CONFIG_AMD_HPP - -#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS -#include "hip/hip_runtime.h" -#include "hip/hip_fp16.h" -#endif -#include "bfloat16_dev.hpp" - -// "Constant" address space for kernel parameter -#define CONSTANT __attribute__((address_space(4))) - -// GPU target -// should enable one and only one GPU target -#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ - defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030)) -#error Need to define (only) one GPU target -#endif - -// launch bounds -#define CK_USE_LAUNCH_BOUNDS 1 - -#ifdef CK_USE_LAUNCH_BOUNDS -#define CK_MAX_THREAD_PER_BLOCK 256 -#define CK_MIN_BLOCK_PER_CU 2 -#endif - -// buffer resourse -#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \ - defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) -#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 -#elif defined(CK_AMD_GPU_GFX1030) -#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 -#endif - -// FMA instruction -#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) -#define CK_USE_AMD_V_MAC_F32 -#elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90a) || \ - defined(CK_AMD_GPU_GFX1030) -#define CK_USE_AMD_V_FMAC_F32 -#define CK_USE_AMD_V_DOT2_F32_F16 -#define CK_USE_AMD_V_DOT4_I32_I8 -#endif - -// multi index -#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0 - -// AMD inline asm -#ifndef CK_USE_AMD_INLINE_ASM -#define CK_USE_AMD_INLINE_ASM 1 -#endif - -// AMD inner product (DLOP) -#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM -#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1 -#endif - -// AMD buffer addressing -#ifndef CK_USE_AMD_BUFFER_ADDRESSING -#define CK_USE_AMD_BUFFER_ADDRESSING 1 -#endif - -// only gfx908 support native floating point atomic add -#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD -#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0 -#endif - -// AMD XDLOPS -#ifndef CK_USE_AMD_XDLOPS -#define CK_USE_AMD_XDLOPS 0 -#endif - -// block synchronization only s_wait lgkmcnt(0), not vmcnt(0) -#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM -#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 -#endif - -// experimental implementation -#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0 -#endif - -#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK -#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1 -#endif - -#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK -#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1 -#endif - -// pass tensor descriptor by value or void* -#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1 -#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 - -// merge transformation use magic number division -#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0 - -// hack: have underlying assumption that need to be satsified, otherwise it's a bug -// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be -// thread-invariant, otherwise it's a bug -// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread" -#ifndef CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE -#define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0 -#endif - -// workaround for compiler crash when compiling recursive lambda -#ifndef CK_WORKAROUND_SWDEV_275126 -#define CK_WORKAROUND_SWDEV_275126 1 -#endif - -// workaround for compiler crash when using buffer load/store for i8 -#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE -#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1 -#endif - -// workaround for compiler crash when using buffer load/store for i8 -#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE -#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1 -#endif - -namespace ck { - -enum InMemoryDataOperationEnum_t -{ - Set, - AtomicAdd -}; - -// index type -using index_t = int32_t; - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/data_type_enum.hpp b/composable_kernel/include/utility/data_type_enum.hpp deleted file mode 100644 index 35df0067a9a0e56198283b8768f92738ff2b2bab..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/data_type_enum.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef CK_DATA_TYPE_ENUM_HPP -#define CK_DATA_TYPE_ENUM_HPP - -namespace ck { - -enum DataTypeEnum_t -{ - Half = 0, - Float = 1, - Int32 = 2, - Int8 = 3, - Int8x4 = 4, - BFloat16 = 5, - Double = 6, - Unknown = 100, -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/data_type_enum_helper.hpp b/composable_kernel/include/utility/data_type_enum_helper.hpp deleted file mode 100644 index 451ce992b1f362c8bee7e71222d0381481826bbe..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/data_type_enum_helper.hpp +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef CK_DATA_TYPE_ENUM_HELPER_HPP -#define CK_DATA_TYPE_ENUM_HELPER_HPP - -#include "data_type.hpp" -#include "data_type_enum.hpp" - -namespace ck { - -template -struct get_datatype_from_enum; - -template <> -struct get_datatype_from_enum -{ - using type = int8_t; -}; - -template <> -struct get_datatype_from_enum -{ - using type = int32_t; -}; - -template <> -struct get_datatype_from_enum -{ - using type = half_t; -}; - -template <> -struct get_datatype_from_enum -{ - using type = float; -}; - -template <> -struct get_datatype_from_enum -{ - using type = double; -}; - -template -struct get_datatype_enum_from_type; - -template <> -struct get_datatype_enum_from_type -{ - static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int8; -}; - -template <> -struct get_datatype_enum_from_type -{ - static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int32; -}; - -template <> -struct get_datatype_enum_from_type -{ - static constexpr DataTypeEnum_t value = DataTypeEnum_t::Half; -}; - -template <> -struct get_datatype_enum_from_type -{ - static constexpr DataTypeEnum_t value = DataTypeEnum_t::Float; -}; - -template <> -struct get_datatype_enum_from_type -{ - static constexpr DataTypeEnum_t value = DataTypeEnum_t::Double; -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/dynamic_buffer.hpp b/composable_kernel/include/utility/dynamic_buffer.hpp deleted file mode 100644 index 886737efacd178db114e1300a7fa0c116b6cebe5..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/dynamic_buffer.hpp +++ /dev/null @@ -1,272 +0,0 @@ -#ifndef CK_BUFFER_HPP -#define CK_BUFFER_HPP - -#include "amd_buffer_addressing.hpp" -#include "c_style_pointer_cast.hpp" -#include "enable_if.hpp" - -namespace ck { - -template -struct DynamicBuffer -{ - using type = T; - - T* p_data_; - ElementSpaceSize element_space_size_; - T invalid_element_value_ = T{0}; - - __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size) - : p_data_{p_data}, element_space_size_{element_space_size} - { - } - - __host__ __device__ constexpr DynamicBuffer(T* p_data, - ElementSpaceSize element_space_size, - T invalid_element_value) - : p_data_{p_data}, - element_space_size_{element_space_size}, - invalid_element_value_{invalid_element_value} - { - } - - __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() - { - return BufferAddressSpace; - } - - __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; } - - __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; } - - template >::type, - typename scalar_type>::type>::value, - bool>::type = false> - __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const - { - // X contains multiple T - constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; - - constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; - - static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, - "wrong! X need to be multiple T"); - -#if CK_USE_AMD_BUFFER_ADDRESSING - bool constexpr use_amd_buffer_addressing = true; -#else - bool constexpr use_amd_buffer_addressing = false; -#endif - - if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global && use_amd_buffer_addressing) - { - constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; - - if constexpr(InvalidElementUseNumericalZeroValue) - { - return amd_buffer_load_invalid_element_return_return_zero, - t_per_x>( - p_data_, i, is_valid_element, element_space_size_); - } - else - { - return amd_buffer_load_invalid_element_return_customized_value, - t_per_x>( - p_data_, i, is_valid_element, element_space_size_, invalid_element_value_); - } - } - else - { - if constexpr(InvalidElementUseNumericalZeroValue) - { - return is_valid_element ? *c_style_pointer_cast(&p_data_[i]) : X{0}; - } - else - { - return is_valid_element ? *c_style_pointer_cast(&p_data_[i]) - : X{invalid_element_value_}; - } - } - } - - template >::type, - typename scalar_type>::type>::value, - bool>::type = false> - __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x) - { - // X contains multiple T - constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; - - constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; - - static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, - "wrong! X need to be multiple T"); - - if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global) - { -#if CK_USE_AMD_BUFFER_ADDRESSING - constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; - - amd_buffer_store, t_per_x>( - x, p_data_, i, is_valid_element, element_space_size_); -#else - if(is_valid_element) - { - *c_style_pointer_cast(&p_data_[i]) = x; - } -#endif - } - else if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Lds) - { - if(is_valid_element) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE - *c_style_pointer_cast(&p_data_[i]) = x; -#else - // HACK: compiler would lower IR "store address_space(3)" into - // inefficient - // ISA, so I try to let compiler emit IR "store" which would be lower to - // ds_write_b128 - // TODO: remove this after compiler fix - if constexpr(is_same>::type, int8_t>::value) - { - static_assert((is_same, int8_t>::value && - is_same, int8_t>::value) || - (is_same, int8_t>::value && - is_same, int8x2_t>::value) || - (is_same, int8_t>::value && - is_same, int8x4_t>::value) || - (is_same, int8x4_t>::value && - is_same, int8x4_t>::value) || - (is_same, int8x8_t>::value && - is_same, int8x8_t>::value) || - (is_same, int8x16_t>::value && - is_same, int8x16_t>::value), - "wrong! not implemented for this combination, please add " - "implementation"); - - if constexpr(is_same, int8_t>::value && - is_same, int8_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8_t>::value && - is_same, int8x2_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8_t>::value && - is_same, int8x4_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8x4_t>::value && - is_same, int8x4_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8x8_t>::value && - is_same, int8x8_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - else if constexpr(is_same, int8x16_t>::value && - is_same, int8x16_t>::value) - { - // HACK: cast pointer of x is bad - // TODO: remove this after compiler fix - *c_style_pointer_cast(&p_data_[i]) = - *c_style_pointer_cast(&x); - } - } - else - { - *c_style_pointer_cast(&p_data_[i]) = x; - } -#endif - } - } - else - { - if(is_valid_element) - { - *c_style_pointer_cast(&p_data_[i]) = x; - } - } - } - - template >::type, - typename scalar_type>::type>::value, - bool>::type = false> - __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x) - { - // X contains multiple T - constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; - - constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; - - static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, - "wrong! X need to be multiple T"); - - static_assert(GetAddressSpace() == AddressSpaceEnum_t::Global, "only support global mem"); - -#if CK_USE_AMD_BUFFER_ADDRESSING - constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; - - amd_buffer_atomic_add, t_per_x>( - x, p_data_, i, is_valid_element, element_space_size_); -#else - if(is_valid_element) - { - atomicAdd(&p_data_[i], x); - } -#endif - } - - __host__ __device__ static constexpr bool IsStaticBuffer() { return false; } - - __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; } -}; - -template -__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size) -{ - return DynamicBuffer{p, element_space_size}; -} - -template < - AddressSpaceEnum_t BufferAddressSpace, - typename T, - typename ElementSpaceSize, - typename X, - typename enable_if, remove_cvref_t>::value, bool>::type = false> -__host__ __device__ constexpr auto -make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value) -{ - return DynamicBuffer{ - p, element_space_size, invalid_element_value}; -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/enable_if.hpp b/composable_kernel/include/utility/enable_if.hpp deleted file mode 100644 index 501e1bfc1cbbc8c07ebf3500edb468386b8a7f27..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/enable_if.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef CK_ENABLE_IF_HPP -#define CK_ENABLE_IF_HPP - -namespace ck { - -template -using enable_if = std::enable_if; - -template -using enable_if_t = typename std::enable_if::type; - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/integral_constant.hpp b/composable_kernel/include/utility/integral_constant.hpp deleted file mode 100644 index 14f3df894bebf5d6056548640335ac2291324b11..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/integral_constant.hpp +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef CK_INTEGRAL_CONSTANT_HPP -#define CK_INTEGRAL_CONSTANT_HPP - -namespace ck { - -template -struct integral_constant -{ - static constexpr T value = v; - typedef T value_type; - typedef integral_constant type; - __host__ __device__ constexpr operator value_type() const noexcept { return value; } - __host__ __device__ constexpr value_type operator()() const noexcept { return value; } -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/magic_division.hpp b/composable_kernel/include/utility/magic_division.hpp deleted file mode 100644 index 612aceea2a122a99845ea97f7c37e5858557c0af..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/magic_division.hpp +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef CK_MAGIC_DIVISION_HPP -#define CK_MAGIC_DIVISION_HPP - -#include "config.hpp" -#include "integral_constant.hpp" -#include "number.hpp" -#include "type.hpp" -#include "tuple.hpp" - -namespace ck { - -// magic number division -// Caution: -// 1. For uint32_t as dividend: magic number division implementation being used would produce -// correct result if the dividend is uint32_t and its value is within 31-bit value range. -// 2. For int32_t as dividendd: magic number division for int32_t dividened has not been -// implemented, the int32_t dividend would be bit-wise interpreted as uint32_t and magic number -// division implementation for uint32_t is then used. Therefore, dividend value need to be -// non-negative. -// TODO: -// 1. Implement magic number divison for int32_t -// 2. Implement magic number divison for unit32_t with 32-bit value range -struct MagicDivision -{ - // uint32_t - __host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor) - { - // assert(divisior >= 1 && divisior <= INT32_MAX); - uint32_t shift = 0; - for(shift = 0; shift < 32; ++shift) - { - if((1U << shift) >= divisor) - { - break; - } - } - - uint64_t one = 1; - uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; - // assert(multiplier <= 0xffffffffUL); - - return make_tuple(uint32_t(multiplier), shift); - } - - __host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor) - { - auto tmp = CalculateMagicNumbers(divisor); - - return tmp[Number<0>{}]; - } - - __host__ __device__ static constexpr uint32_t CalculateMagicShift(uint32_t divisor) - { - auto tmp = CalculateMagicNumbers(divisor); - - return tmp[Number<1>{}]; - } - - // integral_constant - template - __host__ __device__ static constexpr auto - CalculateMagicNumbers(integral_constant) - { - constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor}); - - constexpr uint32_t multiplier = tmp[Number<0>{}]; - constexpr uint32_t shift = tmp[Number<1>{}]; - - return make_tuple(integral_constant{}, - integral_constant{}); - } - - template - __host__ __device__ static constexpr auto - CalculateMagicMultiplier(integral_constant) - { - constexpr uint32_t multiplier = CalculateMagicMultiplier(uint32_t{Divisor}); - - return integral_constant{}; - } - - template - __host__ __device__ static constexpr auto - CalculateMagicShift(integral_constant) - { - constexpr uint32_t shift = CalculateMagicShift(uint32_t{Divisor}); - - return integral_constant{}; - } - - // integral_constant - template - __host__ __device__ static constexpr auto - CalculateMagicNumbers(integral_constant) - { - return CalculateMagicNumbers(integral_constant{}); - } - - template - __host__ __device__ static constexpr auto - CalculateMagicMultiplier(integral_constant) - { - return CalculateMagicMultiplier(integral_constant{}); - } - - template - __host__ __device__ static constexpr auto - CalculateMagicShift(integral_constant) - { - return CalculateMagicShift(integral_constant{}); - } - - // magic division for uint32_t - __host__ __device__ static constexpr uint32_t - DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift) - { - uint32_t tmp = __umulhi(dividend, multiplier); - return (tmp + dividend) >> shift; - } - - // magic division for int32_t - // HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be - // non-negative for result to be correct - // TODO: figure out how to do magic number divison for int32_t as dividended - __host__ __device__ static constexpr int32_t - DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift) - { - uint32_t dividend_u32 = as_type(dividend_i32); - uint32_t tmp = __umulhi(dividend_u32, multiplier); - return (tmp + dividend_u32) >> shift; - } -}; - -} // namespace ck - -#endif diff --git a/composable_kernel/include/utility/multi_index.hpp b/composable_kernel/include/utility/multi_index.hpp deleted file mode 100644 index 0bb34fb1e2aebd95e12ccbc866b518a820e424b8..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/multi_index.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef CK_MULTI_INDEX_HPP -#define CK_MULTI_INDEX_HPP - -#include "common_header.hpp" - -#if CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX -#include "array_multi_index.hpp" -#else -#include "statically_indexed_array_multi_index.hpp" -#endif - -#endif diff --git a/composable_kernel/include/utility/number.hpp b/composable_kernel/include/utility/number.hpp deleted file mode 100644 index f8c56436940c5542575e30d0177818c791916a94..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/number.hpp +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef CK_NUMBER_HPP -#define CK_NUMBER_HPP - -#include "integral_constant.hpp" - -namespace ck { - -template -using Number = integral_constant; - -template -__host__ __device__ constexpr auto operator+(Number, Number) -{ - return Number{}; -} - -template -__host__ __device__ constexpr auto operator-(Number, Number) -{ - static_assert(Y <= X, "wrong!"); - return Number{}; -} - -template -__host__ __device__ constexpr auto operator*(Number, Number) -{ - return Number{}; -} - -template -__host__ __device__ constexpr auto operator/(Number, Number) -{ - static_assert(Y > 0, "wrong!"); - return Number{}; -} - -template -__host__ __device__ constexpr auto operator%(Number, Number) -{ - static_assert(Y > 0, "wrong!"); - return Number{}; -} -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/reduction_common.hpp b/composable_kernel/include/utility/reduction_common.hpp deleted file mode 100644 index ff574c315c18a01075196d661000a19f493882e7..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/reduction_common.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_REDUCTION_COMMON_HPP -#define CK_REDUCTION_COMMON_HPP - -#include "reduction_enums.hpp" - -namespace ck { - -struct float_equal_one -{ - template - __device__ inline bool operator()(T x) - { - return x <= static_cast(1.0f) and x >= static_cast(1.0f); - }; -}; - -struct float_equal_zero -{ - template - __device__ inline bool operator()(T x) - { - return x <= static_cast(0.0f) and x >= static_cast(0.0f); - }; -}; - -}; // end of namespace ck - -#endif diff --git a/composable_kernel/include/utility/reduction_enums.hpp b/composable_kernel/include/utility/reduction_enums.hpp deleted file mode 100644 index e97108179ea85f55ac5eb116fc41b344d84137b8..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/reduction_enums.hpp +++ /dev/null @@ -1,66 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_REDUCTION_ENUMS_HPP -#define CK_REDUCTION_ENUMS_HPP - -namespace ck { - -enum class ReduceTensorOp_t -{ - ADD = 0, - MUL = 1, - MIN = 2, - MAX = 3, - AMAX = 4, - AVG = 5, - NORM1 = 6, - NORM2 = 7, - // MUL_NO_ZEROS = 8, -}; - -enum class NanPropagation_t -{ - NOT_PROPAGATE_NAN = 0, - PROPAGATE_NAN = 1, -}; - -enum class ReduceTensorIndices_t -{ - NO_INDICES = 0, - FLATTENED_INDICES = 1, -}; - -enum class IndicesType_t -{ - INDICES_32BIT = 0, - INDICES_64BIT = 1, - INDICES_16BIT = 2, - INDICES_8BIT = 3, -}; - -}; // end of namespace ck - -#endif diff --git a/composable_kernel/include/utility/reduction_functions_binop.hpp b/composable_kernel/include/utility/reduction_functions_binop.hpp deleted file mode 100644 index 5285abee81eb8e14382667e5c9c450a62206e936..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/reduction_functions_binop.hpp +++ /dev/null @@ -1,100 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_REDUCTION_FUNCTIONS_BINOP_HPP -#define CK_REDUCTION_FUNCTIONS_BINOP_HPP - -#include "data_type.hpp" - -#include "reduction_common.hpp" -#include "reduction_operator.hpp" - -namespace ck { -namespace detail { - -static inline __device__ bool isnan(half_t x) { return __hisnan(x); }; - -template -struct binop_with_nan_check; - -template -struct binop_with_nan_check -{ - // cppcheck-suppress constParameter - __device__ static inline void calculate(compType& accuVal, compType currVal) - { - opReduce{}(accuVal, currVal); - }; - - // The method is called when the opReduce is indexable and the user asked for indices - __device__ static inline void - // cppcheck-suppress constParameter - calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex) - { - bool changed = false; - - opReduce{}(accuVal, currVal, changed); - - if(changed) - accuIndex = currIndex; - }; -}; - -template -struct binop_with_nan_check -{ - __device__ static inline void calculate(compType& accuVal, compType currVal) - { - if(isnan(currVal)) - accuVal = currVal; - else - opReduce{}(accuVal, currVal); - }; - - // The method is called when the opReduce is indexable and the user asked for indices - __device__ static inline void - calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex) - { - if(isnan(currVal)) - { - accuVal = currVal; - accuIndex = currIndex; - } - else - { - bool changed = false; - - opReduce{}(accuVal, currVal, changed); - - if(changed) - accuIndex = currIndex; - } - }; -}; - -}; // namespace detail -}; // end of namespace ck - -#endif diff --git a/composable_kernel/include/utility/reduction_operator.hpp b/composable_kernel/include/utility/reduction_operator.hpp deleted file mode 100644 index c0afbec8695c2b083cbc0d6b67ff111152420632..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/reduction_operator.hpp +++ /dev/null @@ -1,419 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef CK_REDUCTION_OPERATOR_HPP -#define CK_REDUCTION_OPERATOR_HPP - -#include "reduction_common.hpp" - -namespace ck { - -namespace reduce { - -// Every binary operator used in reduction is represented by a templated functor class. Each functor -// class must provide at least -// three members: -// 1) GetReductionZeroVal() -- the interface to return the "identity element" for the binary -// operator, "identity element" is the unique -// element in the algebraic space that doesn't affect the value of other elements -// when operated against them, and the concept is similar to zero vector in -// vector space -// (http://pages.cs.wisc.edu/~matthewb/pages/notes/pdf/linearalgebra/VectorSpaces.pdf). -// 2) indexable -- boolean value indicating whether indices of the operated elements could be -// recorded. Usually, Min/Max operator could -// need to record the indices of elements. For operator like Add/Mul, no need to -// record the indices. -// 3) operator() -- the first argument of the operator must be both an input & output, and the -// corresponding variable usually stores -// the accumulated result of many operator() calls; the second argument is only an -// input. For indexable binary -// operator, the second version of operator() has third argument (which is an -// output) to indicate whether the -// accumulated value (the first argument) has changed, in which case the recorded -// accumulated index also need be -// changed. - -template -struct Add -{ - using dataType = T; - - __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; - - __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; } - - static constexpr bool indexable = false; -}; - -template -struct Mul -{ - using dataType = T; - - __device__ static constexpr T GetReductionZeroVal() { return static_cast(1.0f); }; - - __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; } - - static constexpr bool indexable = false; -}; - -template -struct Max -{ - using dataType = T; - - __device__ static constexpr T GetReductionZeroVal() { return NumericLimits::Lowest(); }; - - __device__ inline constexpr void operator()(T& a, T b) const - { - if(a < b) - a = b; - } - - __device__ inline constexpr void operator()(T& a, T b, bool& changed) const - { - if(a < b) - { - a = b; - changed = true; - } - } - - static constexpr bool indexable = true; -}; - -template -struct Min -{ - using dataType = T; - - __device__ static constexpr T GetReductionZeroVal() { return NumericLimits::Max(); }; - - __device__ inline constexpr void operator()(T& a, T b) const - { - if(a > b) - a = b; - } - - __device__ inline constexpr void operator()(T& a, T b, bool& changed) const - { - if(a > b) - { - a = b; - changed = true; - } - } - - static constexpr bool indexable = true; -}; - -template -struct AMax -{ - using dataType = T; - - __device__ static constexpr T GetReductionZeroVal() { return static_cast(0.0f); }; - - __device__ inline constexpr void operator()(T& a, T b) const - { - if(a < b) - a = b; - } - - __device__ inline constexpr void operator()(T& a, T b, bool& changed) const - { - if(a < b) - { - a = b; - changed = true; - } - } - - static constexpr bool indexable = true; -}; - -// Unary operators are usually called element-wisely before the reduction is executed on the -// elements. -// They are needed for easy implementation of reduction types of AVG, NRM1, NRM2 -template -struct unary_identic -{ - __device__ unary_identic(const int divider = 1) - { - scaler = 1.0f / static_cast(divider); - }; - - __device__ inline constexpr T operator()(T a) const { return a * type_convert{}(scaler); }; - - float scaler = 1.0f; -}; - -template -struct unary_identic -{ - __device__ unary_identic(const int divider = 1) { (void)divider; }; - - __device__ inline constexpr T operator()(T a) const { return a; }; -}; - -template -struct unary_square -{ - __device__ unary_square(const int divider = 1) { scaler = 1.0f / static_cast(divider); }; - - __device__ inline constexpr T operator()(T a) const - { - a = a * a; - - return a * type_convert{}(scaler); - }; - - float scaler = 1.0f; -}; - -template -struct unary_square -{ - __device__ unary_square(const int divider = 1) { (void)divider; }; - - __device__ inline constexpr T operator()(T a) const { return a * a; }; -}; - -template -struct unary_abs -{ - __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast(divider); }; - - __device__ inline constexpr T operator()(T a) const - { - a = abs(a); - - return a * type_convert{}(scaler); - }; - - float scaler = 1.0f; -}; - -template -struct unary_abs -{ - __device__ unary_abs(const int divider = 1) { (void)divider; }; - - __device__ inline constexpr T operator()(T a) const { return abs(a); }; -}; - -// We know for sure that 4.0 has __habs(), but 3.0 does not have it. -// Let's assume that __habs() exists since 3.5. -#if HIP_PACKAGE_VERSION_FLAT < 3005000000 -inline __device__ __half __habs(__half x) -{ - union - { - __half half; - unsigned short u16; - } val; - val.half = x; - val.u16 = val.u16 & 0x7fff; - return val.half; -} -#endif - -template -struct unary_abs -{ - __device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast(divider); }; - - __device__ inline half_t operator()(half_t a) const - { - a = static_cast(__habs(a)); - - return a * type_convert{}(scaler); - }; - - float scaler = 1.0f; -}; - -template <> -struct unary_abs -{ - __device__ unary_abs(const int divider = 1) { (void)divider; }; - - __device__ inline half_t operator()(half_t a) const { return static_cast(__habs(a)); }; -}; - -template -struct unary_sqrt -{ - __device__ unary_sqrt(const int divider = 1) { (void)divider; }; - - __device__ inline T operator()(T a) const { return sqrtf(a); }; -}; - -template <> -struct unary_sqrt -{ - __device__ unary_sqrt(const int divider = 1) { (void)divider; }; - - __device__ inline half_t operator()(half_t a) const { return static_cast(hsqrt(a)); }; -}; - -}; // end of namespace reduce - -// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their -// respective functor classes. -// The "GetReductionZeroVal()" interface and boolean member "indexable" are also provided in -// reduce_binary_operactor for -// easier checking by the upper-layer codes in the kernels. - -template -struct reduce_binary_operator; - -template -struct reduce_binary_operator -{ - using opType = reduce::Add; - using dataType = T; - - static constexpr bool indexable = reduce::Add::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Mul; - using dataType = T; - - static constexpr bool indexable = reduce::Mul::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Min; - using dataType = T; - - static constexpr bool indexable = reduce::Min::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Max; - using dataType = T; - - static constexpr bool indexable = reduce::Max::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::AMax; - using dataType = T; - - static constexpr bool indexable = reduce::Max::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Add; - using dataType = T; - - static constexpr bool indexable = reduce::Add::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Add; - using dataType = T; - - static constexpr bool indexable = reduce::Add::indexable; -}; - -template -struct reduce_binary_operator -{ - using opType = reduce::Add; - using dataType = T; - - static constexpr bool indexable = reduce::Add::indexable; -}; - -// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary -// functor classes. -// The two unary functors are called before and afer the Reduction is executed respectively -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_identic; - using posUnaryOp = reduce::unary_identic; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_identic; - using posUnaryOp = reduce::unary_identic; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_abs; - using posUnaryOp = reduce::unary_identic; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_abs; - using posUnaryOp = reduce::unary_identic; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_square; - using posUnaryOp = reduce::unary_identic; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_square; - using posUnaryOp = reduce::unary_sqrt; -}; - -template -struct reduce_unary_operator -{ - using preUnaryOp = reduce::unary_identic; - using posUnaryOp = reduce::unary_sqrt; -}; - -} // end of namespace ck - -#endif diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp deleted file mode 100644 index 9615d10c5971f90fe2205b2ed3d444ba8f2e2c0f..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/static_buffer.hpp +++ /dev/null @@ -1,163 +0,0 @@ -#ifndef CK_STATIC_BUFFER_HPP -#define CK_STATIC_BUFFER_HPP - -#include "statically_indexed_array.hpp" - -namespace ck { - -template -struct StaticBuffer : public StaticallyIndexedArray -{ - using type = T; - using base = StaticallyIndexedArray; - - T invalid_element_value_ = T{0}; - - __host__ __device__ constexpr StaticBuffer() : base{} {} - - __host__ __device__ constexpr StaticBuffer(T invalid_element_value) - : base{}, invalid_element_value_{invalid_element_value} - { - } - - __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() - { - return BufferAddressSpace; - } - - template - __host__ __device__ constexpr auto Get(Number i, bool is_valid_element) const - { - if constexpr(InvalidElementUseNumericalZeroValue) - { - return is_valid_element ? At(i) : T{0}; - } - else - { - return is_valid_element ? At(i) : invalid_element_value_; - } - } - - template - __host__ __device__ void Set(Number i, bool is_valid_element, const T& x) - { - if(is_valid_element) - { - At(i) = x; - } - } - - __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } - - __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } -}; - -template -struct StaticBufferV2 : public StaticallyIndexedArray -{ - using type = T; - using base = StaticallyIndexedArray; - - using VecBaseType = typename T::d1_t; - - __host__ __device__ static constexpr index_t GetVectorSize() - { - return sizeof(typename T::type) / sizeof(VecBaseType); - } - - static constexpr index_t vector_size = GetVectorSize(); - - VecBaseType invalid_element_value_ = VecBaseType{0}; - - T invalid_vec_value_ = T{0}; - - __host__ __device__ constexpr StaticBufferV2() : base{} {} - - __host__ __device__ constexpr StaticBufferV2(VecBaseType invalid_element_value) - : base{}, - invalid_vec_value_{invalid_element_value}, - invalid_element_value_{invalid_element_value} - { - } - - __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() - { - return BufferAddressSpace; - } - - template - __host__ __device__ constexpr auto& GetVector(Number vec_id) - { - return this->At(vec_id); - } - - template - __host__ __device__ constexpr const auto& GetVector(Number vec_id) const - { - return this->At(vec_id); - } - - template - __host__ __device__ constexpr auto& GetElement(Number i, bool) - { - constexpr auto vec_id = Number{}; - constexpr auto vec_off = Number{}; - - return this->At(vec_id).template AsType()(vec_off); - } - - template - __host__ __device__ constexpr auto GetElement(Number i, bool is_valid_element) const - { - constexpr auto vec_id = Number{}; - constexpr auto vec_off = Number{}; - - if constexpr(InvalidElementUseNumericalZeroValue) - { - return is_valid_element ? this->At(vec_id).template AsType()[vec_off] - : VecBaseType{0}; - } - else - { - return is_valid_element ? this->At(vec_id).template AsType()[vec_off] - : invalid_element_value_; - } - } - - template - __host__ __device__ constexpr auto operator[](Number i) const - { - return GetElement(i, true); - } - - template - __host__ __device__ constexpr auto& operator()(Number i) - { - return GetElement(i, true); - } - - __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } - - __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } -}; - -template -__host__ __device__ constexpr auto make_static_buffer(Number) -{ - return StaticBuffer{}; -} - -template -__host__ __device__ constexpr auto make_static_buffer(Number, T invalid_element_value) -{ - return StaticBuffer{invalid_element_value}; -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/statically_indexed_array.hpp b/composable_kernel/include/utility/statically_indexed_array.hpp deleted file mode 100644 index f30a3a9ee637cea7386bde190aa0fef7bdfcc736..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/statically_indexed_array.hpp +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef CK_STATICALLY_INDEXED_ARRAY_HPP -#define CK_STATICALLY_INDEXED_ARRAY_HPP - -#include "functional2.hpp" -#include "sequence.hpp" -#include "tuple.hpp" - -namespace ck { - -namespace detail { - -template -__host__ __device__ constexpr auto generate_same_type_tuple() -{ - return generate_tuple([](auto) -> T { return T{}; }, Number{}); -} - -template -using same_type_tuple = decltype(generate_same_type_tuple()); - -} // namespace detail - -template -using StaticallyIndexedArray = detail::same_type_tuple; - -template -__host__ __device__ constexpr auto make_statically_indexed_array(const X& x, const Xs&... xs) -{ - return StaticallyIndexedArray(x, static_cast(xs)...); -} - -// make empty StaticallyIndexedArray -template -__host__ __device__ constexpr auto make_statically_indexed_array() -{ - return StaticallyIndexedArray(); -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/statically_indexed_array_multi_index.hpp b/composable_kernel/include/utility/statically_indexed_array_multi_index.hpp deleted file mode 100644 index 9e96f06d737e9e754ee1f7deaed4e66647482b4f..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/statically_indexed_array_multi_index.hpp +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP -#define CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP - -#include "common_header.hpp" - -namespace ck { - -template -using MultiIndex = StaticallyIndexedArray; - -template -__host__ __device__ constexpr auto make_multi_index(Xs&&... xs) -{ - return make_statically_indexed_array(index_t{xs}...); -} - -template -__host__ __device__ constexpr auto make_zero_multi_index() -{ - return unpack([](auto... xs) { return make_multi_index(xs...); }, - typename uniform_sequence_gen::type{}); -} - -template -__host__ __device__ constexpr auto to_multi_index(const T& x) -{ - return unpack([](auto... ys) { return make_multi_index(ys...); }, x); -} - -// Here should use MultiIndex, instead of Tuple, although the former -// is the alias of the latter. This is because compiler cannot infer the NSize if -// using MultiIndex -// TODO: how to fix this? -template -__host__ __device__ constexpr auto operator+=(Tuple& y, const X& x) -{ - static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same"); - constexpr index_t NSize = sizeof...(Ys); - static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; }); - return y; -} - -template -__host__ __device__ constexpr auto operator-=(Tuple& y, const X& x) -{ - static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same"); - constexpr index_t NSize = sizeof...(Ys); - static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; }); - return y; -} - -template -__host__ __device__ constexpr auto operator+(const Tuple& x, const Y& y) -{ - static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); - constexpr index_t NSize = sizeof...(Xs); - - Tuple r; - static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] + y[i]; }); - return r; -} - -template -__host__ __device__ constexpr auto operator-(const Tuple& x, const Y& y) -{ - static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); - constexpr index_t NSize = sizeof...(Xs); - - Tuple r; - static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] - y[i]; }); - return r; -} - -template -__host__ __device__ constexpr auto operator*(const Tuple& x, const Y& y) -{ - static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); - constexpr index_t NSize = sizeof...(Xs); - - Tuple r; - static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] * y[i]; }); - return r; -} - -// MultiIndex = index_t * MultiIndex -template -__host__ __device__ constexpr auto operator*(index_t a, const Tuple& x) -{ - constexpr index_t NSize = sizeof...(Xs); - - Tuple r; - static_for<0, NSize, 1>{}([&](auto i) { r(i) = a * x[i]; }); - return r; -} - -template -__host__ __device__ void print_multi_index(const Tuple& x) -{ - printf("{"); - printf("MultiIndex, "); - printf("size %d,", index_t{sizeof...(Xs)}); - static_for<0, sizeof...(Xs), 1>{}( - [&](auto i) { printf("%d ", static_cast(x.At(i))); }); - printf("}"); -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/synchronization.hpp b/composable_kernel/include/utility/synchronization.hpp deleted file mode 100644 index da74f2074db33a9dbcb919e0eca36d55548180de..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/synchronization.hpp +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef CK_SYNCHRONIZATION_AMD_HPP -#define CK_SYNCHRONIZATION_AMD_HPP - -#include "config.hpp" - -namespace ck { - -__device__ void block_sync_lds() -{ -#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM - asm volatile("\ - s_waitcnt lgkmcnt(0) \n \ - s_barrier \ - " ::); -#else - __syncthreads(); -#endif -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp deleted file mode 100644 index 70f4d77d87431d9599638aba24424f35e1957e0e..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/tuple.hpp +++ /dev/null @@ -1,166 +0,0 @@ -#ifndef CK_TUPLE_HPP -#define CK_TUPLE_HPP - -#include "integral_constant.hpp" -#include "sequence.hpp" -#include "type.hpp" -#include "enable_if.hpp" - -namespace ck { - -namespace detail { - -template -struct TupleElementKey -{ - __host__ __device__ constexpr TupleElementKey() = default; -}; - -template -struct TupleElement -{ - __host__ __device__ constexpr TupleElement() = default; - - template >, TupleElement>::value, - bool>::type = false> - __host__ __device__ constexpr TupleElement(T&& v) : mData(std::forward(v)) - { - } - - Data mData; -}; - -template -__host__ __device__ constexpr const Data& get_tuple_element(const TupleElement& x) -{ - return static_cast(x.mData); -} - -template -__host__ __device__ constexpr Data& get_tuple_element(TupleElement& x) -{ - return x.mData; -} - -// TODO: not sure the use of reference is correct -template -__host__ __device__ constexpr Data&& get_tuple_element(TupleElement&& x) -{ - return static_cast(x.mData); -} - -template -struct TupleImpl; - -template -struct TupleImpl, Xs...> : TupleElement, Xs>... -{ - __host__ __device__ constexpr TupleImpl() = default; - - template >, TupleImpl>::value, - bool>::type = false> - __host__ __device__ constexpr TupleImpl(Y&& y) - : TupleElement, Xs>(std::forward(y))... - { - } - - template = 2, bool>::type = false> - __host__ __device__ constexpr TupleImpl(Ys&&... ys) - : TupleElement, Xs>(std::forward(ys))... - { - static_assert(sizeof...(Is) == sizeof...(Xs) && sizeof...(Is) == sizeof...(Ys), - "wrong! inconsistent size"); - } - - __host__ __device__ static constexpr index_t Size() { return sizeof...(Xs); } - - template - __host__ __device__ constexpr const auto& GetElementByKey(TupleElementKey) const - { - return get_tuple_element>(*this); - } - - template - __host__ __device__ constexpr auto& GetElementByKey(TupleElementKey) - { - return get_tuple_element>(*this); - } -}; - -} // namespace detail - -template -struct Tuple : detail::TupleImpl::type, Xs...> -{ - using base = - detail::TupleImpl::type, Xs...>; - - __host__ __device__ constexpr Tuple() = default; - - template >, Tuple>::value, - bool>::type = false> - __host__ __device__ constexpr Tuple(Y&& y) : base(std::forward(y)) - { - } - - template = 2, bool>::type = - false> - __host__ __device__ constexpr Tuple(Ys&&... ys) : base(std::forward(ys)...) - { - } - - __host__ __device__ static constexpr index_t Size() { return sizeof...(Xs); } - - template - __host__ __device__ constexpr const auto& At(Number) const - { - static_assert(I < base::Size(), "wrong! out of range"); - return base::GetElementByKey(detail::TupleElementKey{}); - } - - template - __host__ __device__ constexpr auto& At(Number) - { - static_assert(I < base::Size(), "wrong! out of range"); - return base::GetElementByKey(detail::TupleElementKey{}); - } - - template - __host__ __device__ constexpr const auto& operator[](Number i) const - { - return At(i); - } - - template - __host__ __device__ constexpr auto& operator()(Number i) - { - return At(i); - } - - template - __host__ __device__ constexpr auto operator=(const T& a) - { - static_assert(T::Size() == Size(), "wrong! size not the same"); - - static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; }); - - return *this; - } - - __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } -}; - -template -__host__ __device__ constexpr auto make_tuple(Xs&&... xs) -{ - return Tuple...>(std::forward(xs)...); -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/tuple_helper.hpp b/composable_kernel/include/utility/tuple_helper.hpp deleted file mode 100644 index 55a79d2594e2ec5dac2f3113060a0e608d369eae..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/tuple_helper.hpp +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef CK_TUPLE_HELPER_HPP -#define CK_TUPLE_HELPER_HPP - -#include "functional4.hpp" -#include "tuple.hpp" - -namespace ck { - -template -struct is_known_at_compile_time> -{ - __host__ __device__ static constexpr bool IsKnownAtCompileTime() - { - return container_reduce( - Tuple{}, - [](auto x, bool r) { - return is_known_at_compile_time>::value & r; - }, - true); - } - - static constexpr bool value = IsKnownAtCompileTime(); -}; - -template -__host__ __device__ constexpr auto generate_tuple(F&& f, Number) -{ - return unpack([&f](auto&&... xs) { return make_tuple(f(xs)...); }, - typename arithmetic_sequence_gen<0, N, 1>::type{}); -} - -namespace detail { - -template -__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence) -{ - return make_tuple(f(x.At(Number{}))...); -} - -template -__host__ __device__ constexpr auto -transform_tuples_impl(F f, const X& x, const Y& y, Sequence) -{ - return make_tuple(f(x.At(Number{}), y.At(Number{}))...); -} - -template -__host__ __device__ constexpr auto -transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, Sequence) -{ - return make_tuple(f(x.At(Number{}), y.At(Number{}), z.At(Number{}))...); -} - -} // namespace detail - -template -__host__ __device__ constexpr auto transform_tuples(F f, const X& x) -{ - return detail::transform_tuples_impl( - f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); -} - -template -__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y) -{ - return detail::transform_tuples_impl( - f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); -} - -template -__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z) -{ - return detail::transform_tuples_impl( - f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/type.hpp b/composable_kernel/include/utility/type.hpp deleted file mode 100644 index 89a2bdbde633e53b9ff078e5ab07b2ba029d3700..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/type.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef CK_TYPE_HPP -#define CK_TYPE_HPP - -#include "integral_constant.hpp" -#include "enable_if.hpp" - -namespace ck { - -template -struct is_same : public integral_constant -{ -}; - -template -struct is_same : public integral_constant -{ -}; - -template -using remove_reference_t = typename std::remove_reference::type; - -template -using remove_cv_t = typename std::remove_cv::type; - -template -using remove_cvref_t = remove_cv_t>; - -template -inline constexpr bool is_pointer_v = std::is_pointer::value; - -template -struct is_known_at_compile_time; - -template <> -struct is_known_at_compile_time -{ - static constexpr bool value = false; -}; - -template -struct is_known_at_compile_time> -{ - static constexpr bool value = true; -}; - -template ::type = false> -__host__ __device__ constexpr Y as_type(X x) -{ - union AsType - { - X x; - Y y; - }; - - return AsType{x}.y; -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/utility.hpp b/composable_kernel/include/utility/utility.hpp deleted file mode 100644 index 9f34e044b71e561bb9b1d192bd0e5a39a1acb489..0000000000000000000000000000000000000000 --- a/composable_kernel/include/utility/utility.hpp +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef CK_UTILITY_HPP -#define CK_UTILITY_HPP - -#include "config.hpp" - -namespace ck { - -__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } - -__device__ index_t get_block_1d_id() { return blockIdx.x; } - -} // namespace ck - -#endif diff --git a/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp deleted file mode 100644 index 09a7fffa3ed777da39bfebcbe202cb69353c3c24..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp +++ /dev/null @@ -1,370 +0,0 @@ -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "gridwise_gemm_dlops_v1r2.hpp" -#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp" - -using namespace ck; - -constexpr DataTypeEnum_t ABDataTypeEnum = static_cast(CK_PARAM_ABDataTypeEnum); -constexpr DataTypeEnum_t AccDataTypeEnum = static_cast(CK_PARAM_AccDataTypeEnum); -constexpr DataTypeEnum_t CDataTypeEnum = static_cast(CK_PARAM_CDataTypeEnum); - -using FloatAB = typename get_datatype_from_enum::type; -using FloatAcc = typename get_datatype_from_enum::type; -using FloatC = typename get_datatype_from_enum::type; - -constexpr index_t BlockSize = CK_PARAM_BlockSize; - -constexpr index_t MPerBlock = CK_PARAM_MPerBlock; -constexpr index_t NPerBlock = CK_PARAM_NPerBlock; -constexpr index_t KPerBlock = CK_PARAM_KPerBlock; -constexpr index_t M1PerThread = CK_PARAM_M1PerThread; -constexpr index_t N1PerThread = CK_PARAM_N1PerThread; -constexpr index_t KPerThread = CK_PARAM_KPerThread; -constexpr index_t M1N1ThreadClusterM10 = CK_PARAM_M1N1ThreadClusterM10; -constexpr index_t M1N1ThreadClusterN10 = CK_PARAM_M1N1ThreadClusterN10; -constexpr index_t M1N1ThreadClusterM11 = CK_PARAM_M1N1ThreadClusterM11; -constexpr index_t M1N1ThreadClusterN11 = CK_PARAM_M1N1ThreadClusterN11; - -using ABlockTransferThreadSliceLengths_K_M0_M1 = - Sequence; -using ABlockTransferThreadClusterLengths_K_M0_M1 = - Sequence; -using ABlockTransferThreadClusterArrangeOrder = - Sequence; -using ABlockTransferSrcAccessOrder = Sequence; - -constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim; -constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector; -constexpr index_t ABlockTransferDstScalarPerVector_M1 = - CK_PARAM_ABlockTransferDstScalarPerVector_M1; -constexpr bool AThreadTransferSrcResetCoordinateAfterRun = - static_cast(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun); - -using BBlockTransferThreadSliceLengths_K_N0_N1 = - Sequence; -using BBlockTransferThreadClusterLengths_K_N0_N1 = - Sequence; -using BBlockTransferThreadClusterArrangeOrder = - Sequence; -using BBlockTransferSrcAccessOrder = Sequence; - -constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim; -constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector; -constexpr index_t BBlockTransferDstScalarPerVector_N1 = - CK_PARAM_BBlockTransferDstScalarPerVector_N1; -constexpr bool BThreadTransferSrcResetCoordinateAfterRun = - static_cast(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun); - -using CThreadTransferSrcDstAccessOrder = Sequence; -constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim; -constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; - -constexpr bool HasMainKBlockLoop = static_cast(CK_PARAM_HAS_MAIN_KBLOCK_LOOP); -constexpr bool HasDoubleTailKBlockLoop = static_cast(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP); - -extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare( - int n, - int c, - int hi, - int wi, - int k, - int y, - int x, - int convStrideH, - int convStrideW, - int convDilationY, - int convDilationX, - int leftPadH, - int leftPadW, - int rightPadH, - int rightPadW, - void* p_a_k_m0_m1_grid_desc, - void* p_b_k_n0_n1_grid_desc, - void* p_c_m0_m10_m11_n0_n10_n11_grid_desc, - void* p_c_blockid_to_m0_n0_block_cluster_adaptor) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1; - const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; - - const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(n, c, hi, wi)); - const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(k, c, y, x)); - const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(n, k, ho, wo)); - - const auto descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad( - wei_k_c_y_x_desc, - in_n_c_hi_wi_desc, - out_n_k_ho_wo_desc, - make_tuple(convStrideH, convStrideW), - make_tuple(convDilationY, convDilationX), - make_tuple(leftPadH, leftPadW), - make_tuple(rightPadH, rightPadW)); - - const auto a_k_m_grid_desc = descs[I0]; - const auto b_k_n_grid_desc = descs[I1]; - const auto c_m_n_grid_desc = descs[I2]; - - using AKMGridDesc = decltype(a_k_m_grid_desc); - using BKNGridDesc = decltype(b_k_n_grid_desc); - using CMNGridDesc = decltype(c_m_n_grid_desc); - - using AGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}))); - - using BGridStepHacks = - decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}))); - - using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}))); - - using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>; - using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; - - using GridwiseGemm = - GridwiseGemmDlops_km_kn_mn_v1r2; - - auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); - auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc); - auto c_m0_m10_m11_n0_n10_n11_grid_desc = - GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc); - auto c_blockid_to_m0_n0_block_cluster_adaptor = - GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc); - - if(hipThreadIdx_x == 0) - { - *static_cast(p_a_k_m0_m1_grid_desc) = a_k_m0_m1_grid_desc; - *static_cast(p_b_k_n0_n1_grid_desc) = b_k_n0_n1_grid_desc; - *static_cast( - p_c_m0_m10_m11_n0_n10_n11_grid_desc) = c_m0_m10_m11_n0_n10_n11_grid_desc; - *static_cast( - p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor; - }; -}; - -extern "C" __global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k_m0_m1_grid_desc, - const void CONSTANT* p_b_k_n0_n1_grid_desc, - const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - constexpr auto in_n_c_hi_wi_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); - constexpr auto wei_k_c_y_x_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3)); - constexpr auto out_n_k_ho_wo_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); - - constexpr auto descs = - transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, - in_n_c_hi_wi_desc, - out_n_k_ho_wo_desc, - make_tuple(1, 1), - make_tuple(1, 1), - make_tuple(1, 1), - make_tuple(1, 1)); - - constexpr auto a_k_m_grid_desc = descs[I0]; - constexpr auto b_k_n_grid_desc = descs[I1]; - constexpr auto c_m_n_grid_desc = descs[I2]; - - using AKMGridDesc = decltype(a_k_m_grid_desc); - using BKNGridDesc = decltype(b_k_n_grid_desc); - using CMNGridDesc = decltype(c_m_n_grid_desc); - - using AGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}))); - - using BGridStepHacks = - decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}))); - - using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}))); - - using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>; - using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; - - using GridwiseGemm = - GridwiseGemmDlops_km_kn_mn_v1r2; - - constexpr auto a_k_m0_m1_grid_desc_tmp = - GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); - constexpr auto b_k_n0_n1_grid_desc_tmp = - GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc); - constexpr auto c_m0_m10_m11_n0_n10_n11_grid_desc_tmp = - GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc); - constexpr auto c_blockid_to_m0_n0_block_cluster_adaptor_tmp = - GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc); - - using AKM0M1GridDesc = decltype(a_k_m0_m1_grid_desc_tmp); - using BKN0N1GridDesc = decltype(b_k_n0_n1_grid_desc_tmp); - using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc_tmp); - using CBlockIdToM0N0BlockClusterAdaptor = - decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp); - - const auto a_k_m0_m1_grid_desc = - *reinterpret_cast((const void*)p_a_k_m0_m1_grid_desc); - const auto b_k_n0_n1_grid_desc = - *reinterpret_cast((const void*)p_b_k_n0_n1_grid_desc); - const auto c_m0_m10_m11_n0_n10_n11_grid_desc = - *reinterpret_cast( - (const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc); - const auto c_blockid_to_m0_n0_block_cluster_adaptor = - *reinterpret_cast( - (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); - - constexpr index_t shared_block_size = - GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - __shared__ FloatAB p_shared_block[shared_block_size]; - - GridwiseGemm::Run(p_a_grid, - p_b_grid, - p_c_grid, - p_shared_block, - a_k_m0_m1_grid_desc, - b_k_n0_n1_grid_desc, - c_m0_m10_m11_n0_n10_n11_grid_desc, - c_blockid_to_m0_n0_block_cluster_adaptor, - integral_constant{}, - integral_constant{}); -}; diff --git a/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp deleted file mode 100644 index 51d852617f88c950f71553815b9705909788e315..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp +++ /dev/null @@ -1,358 +0,0 @@ -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "gridwise_gemm_xdlops_v2r3.hpp" -#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp" - -using namespace ck; - -constexpr DataTypeEnum_t ABDataTypeEnum = static_cast(CK_PARAM_ABDataTypeEnum); -constexpr DataTypeEnum_t AccDataTypeEnum = static_cast(CK_PARAM_AccDataTypeEnum); -constexpr DataTypeEnum_t CDataTypeEnum = static_cast(CK_PARAM_CDataTypeEnum); - -using FloatAB = typename get_datatype_from_enum::type; -using FloatAcc = typename get_datatype_from_enum::type; -using FloatC = typename get_datatype_from_enum::type; - -constexpr index_t BlockSize = CK_PARAM_BlockSize; - -constexpr index_t MPerBlock = CK_PARAM_MPerBlock; -constexpr index_t NPerBlock = CK_PARAM_NPerBlock; -constexpr index_t KPerBlock = CK_PARAM_KPerBlock; - -constexpr index_t MPerWave = CK_PARAM_MPerWave; -constexpr index_t NPerWave = CK_PARAM_NPerWave; -constexpr index_t MRepeat = CK_PARAM_MRepeat; -constexpr index_t NRepeat = CK_PARAM_NRepeat; -constexpr index_t K1 = CK_PARAM_K1; - -using ABlockTransferThreadSliceLengths_K0_M_K1 = - Sequence; -using ABlockTransferThreadClusterLengths_K0_M_K1 = - Sequence; -using ABlockTransferThreadClusterArrangeOrder = - Sequence; -using ABlockTransferSrcAccessOrder = Sequence; - -constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim; -constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector; -constexpr index_t ABlockTransferDstScalarPerVector_K1 = - CK_PARAM_ABlockTransferDstScalarPerVector_K1; -constexpr bool AThreadTransferSrcResetCoordinateAfterRun = - static_cast(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun); - -using BBlockTransferThreadSliceLengths_K0_N_K1 = - Sequence; -using BBlockTransferThreadClusterLengths_K0_N_K1 = - Sequence; -using BBlockTransferThreadClusterArrangeOrder = - Sequence; -using BBlockTransferSrcAccessOrder = Sequence; - -constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim; -constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector; -constexpr index_t BBlockTransferDstScalarPerVector_K1 = - CK_PARAM_BBlockTransferDstScalarPerVector_K1; -constexpr bool BThreadTransferSrcResetCoordinateAfterRun = - static_cast(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun); - -using CThreadTransferSrcDstAccessOrder = Sequence; -constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim; -constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; - -extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare( - int n, - int c, - int hi, - int wi, - int k, - int y, - int x, - int convStrideH, - int convStrideW, - int convDilationY, - int convDilationX, - int leftPadH, - int leftPadW, - int rightPadH, - int rightPadW, - void* p_a_k0_m_k1_grid_desc, - void* p_b_k0_n_k1_grid_desc, - void* p_c_m0_m1_m2_n_grid_desc, - void* p_c_blockid_to_m0_n0_block_cluster_adaptor) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1; - const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; - - const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(n, c, hi, wi)); - const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(k, c, y, x)); - const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(n, k, ho, wo)); - - const auto descs = transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad( - wei_k_c_y_x_desc, - in_n_c_hi_wi_desc, - out_n_k_ho_wo_desc, - make_tuple(convStrideH, convStrideW), - make_tuple(convDilationY, convDilationX), - make_tuple(leftPadH, leftPadW), - make_tuple(rightPadH, rightPadW), - Number{}); - - const auto a_k0_m_k1_grid_desc = descs[I0]; - const auto b_k0_n_k1_grid_desc = descs[I1]; - const auto c_m_n_grid_desc = descs[I2]; - - using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc); - using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc); - using CMNGridDesc = decltype(c_m_n_grid_desc); - - using AGridStepHacks = decltype(make_tuple( - make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), - make_tuple( - Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}))); - - using BGridStepHacks = - decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); - - using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}))); - - using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>; - using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; - - using GridwiseGemm = - GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3; - - auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); - - auto c_blockid_to_m0_n0_block_cluster_adaptor = - GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc); - - if(hipThreadIdx_x == 0) - { - *static_cast*>(p_a_k0_m_k1_grid_desc) = - a_k0_m_k1_grid_desc; - *static_cast*>(p_b_k0_n_k1_grid_desc) = - b_k0_n_k1_grid_desc; - *static_cast(p_c_m0_m1_m2_n_grid_desc) = - c_m0_m1_m2_n_grid_desc; - *static_cast( - p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor; - } -}; - -extern "C" __global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k0_m_k1_grid_desc, - const void CONSTANT* p_b_k0_n_k1_grid_desc, - const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) -{ - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - constexpr auto in_n_c_hi_wi_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); - constexpr auto wei_k_c_y_x_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3)); - constexpr auto out_n_k_ho_wo_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); - - constexpr auto descs = - transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, - in_n_c_hi_wi_desc, - out_n_k_ho_wo_desc, - make_tuple(1, 1), - make_tuple(1, 1), - make_tuple(1, 1), - make_tuple(1, 1), - Number{}); - - constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0]; - constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1]; - constexpr auto c_m_n_grid_desc = descs[I2]; - - using AGridStepHacks = decltype(make_tuple( - make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), - make_tuple( - Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}))); - - using BGridStepHacks = - decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); - - using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}))); - - using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>; - using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; - - using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp); - using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp); - using CMNGridDesc = decltype(c_m_n_grid_desc); - - using GridwiseGemm = - GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3; - - constexpr auto c_m0_m1_m2_n_grid_desc_tmp = - GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); - constexpr auto c_blockid_to_m0_n0_block_cluster_adaptor_tmp = - GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc); - - using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp); - using CBlockIdToM0N0BlockClusterAdaptor = - decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp); - - const auto a_k0_m_k1_grid_desc = - *reinterpret_cast((const void*)p_a_k0_m_k1_grid_desc); - const auto b_k0_n_k1_grid_desc = - *reinterpret_cast((const void*)p_b_k0_n_k1_grid_desc); - const auto c_m0_m1_m2_n_grid_desc = - *reinterpret_cast((const void*)p_c_m0_m1_m2_n_grid_desc); - const auto c_blockid_to_m0_n0_block_cluster_adaptor = - *reinterpret_cast( - (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); - - constexpr index_t shared_block_size = - GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - __shared__ FloatAB p_shared_block[shared_block_size]; - - GridwiseGemm::Run(p_a_grid, - p_b_grid, - p_c_grid, - p_shared_block, - a_k0_m_k1_grid_desc, - b_k0_n_k1_grid_desc, - c_m0_m1_m2_n_grid_desc, - c_blockid_to_m0_n0_block_cluster_adaptor); -}; diff --git a/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp deleted file mode 100644 index 30e4c518ced1bc79fef9af5ca3cb4a2b68764f83..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp +++ /dev/null @@ -1,357 +0,0 @@ -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "gridwise_gemm_xdlops_v2r3.hpp" -#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp" - -using namespace ck; - -constexpr DataTypeEnum_t ABDataTypeEnum = static_cast(CK_PARAM_ABDataTypeEnum); -constexpr DataTypeEnum_t AccDataTypeEnum = static_cast(CK_PARAM_AccDataTypeEnum); -constexpr DataTypeEnum_t CDataTypeEnum = static_cast(CK_PARAM_CDataTypeEnum); - -using FloatAB = typename get_datatype_from_enum::type; -using FloatAcc = typename get_datatype_from_enum::type; -using FloatC = typename get_datatype_from_enum::type; - -constexpr index_t BlockSize = CK_PARAM_BlockSize; - -constexpr index_t MPerBlock = CK_PARAM_MPerBlock; -constexpr index_t NPerBlock = CK_PARAM_NPerBlock; -constexpr index_t KPerBlock = CK_PARAM_KPerBlock; - -constexpr index_t MPerWave = CK_PARAM_MPerWave; -constexpr index_t NPerWave = CK_PARAM_NPerWave; -constexpr index_t MRepeat = CK_PARAM_MRepeat; -constexpr index_t NRepeat = CK_PARAM_NRepeat; -constexpr index_t K1 = CK_PARAM_K1; - -using ABlockTransferThreadSliceLengths_K0_M_K1 = - Sequence; -using ABlockTransferThreadClusterLengths_K0_M_K1 = - Sequence; -using ABlockTransferThreadClusterArrangeOrder = - Sequence; -using ABlockTransferSrcAccessOrder = Sequence; - -constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim; -constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector; -constexpr index_t ABlockTransferDstScalarPerVector_K1 = - CK_PARAM_ABlockTransferDstScalarPerVector_K1; -constexpr bool AThreadTransferSrcResetCoordinateAfterRun = - static_cast(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun); - -using BBlockTransferThreadSliceLengths_K0_N_K1 = - Sequence; -using BBlockTransferThreadClusterLengths_K0_N_K1 = - Sequence; -using BBlockTransferThreadClusterArrangeOrder = - Sequence; -using BBlockTransferSrcAccessOrder = Sequence; - -constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim; -constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector; -constexpr index_t BBlockTransferDstScalarPerVector_K1 = - CK_PARAM_BBlockTransferDstScalarPerVector_K1; -constexpr bool BThreadTransferSrcResetCoordinateAfterRun = - static_cast(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun); - -using CThreadTransferSrcDstAccessOrder = Sequence; -constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim; -constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; - -extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare( - int n, - int hi, - int wi, - int c, - int k, - int y, - int x, - int convStrideH, - int convStrideW, - int convDilationY, - int convDilationX, - int leftPadH, - int leftPadW, - int rightPadH, - int rightPadW, - void* p_a_k0_m_k1_grid_desc, - void* p_b_k0_n_k1_grid_desc, - void* p_c_m0_m1_m2_n_grid_desc, - void* p_c_blockid_to_m0_n0_block_cluster_adaptor) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1; - const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1; - - const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(make_tuple(n, hi, wi, c)); - const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(make_tuple(k, y, x, c)); - const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(make_tuple(n, ho, wo, k)); - - const auto descs = transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad( - in_n_hi_wi_c_desc, - wei_k_y_x_c_desc, - out_n_ho_wo_k_desc, - make_tuple(convStrideH, convStrideW), - make_tuple(convDilationY, convDilationX), - make_tuple(leftPadH, leftPadW), - make_tuple(rightPadH, rightPadW), - Number{}); - - const auto a_k0_m_k1_grid_desc = descs[I0]; - const auto b_k0_n_k1_grid_desc = descs[I1]; - const auto c_m_n_grid_desc = descs[I2]; - - using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc); - using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc); - using CMNGridDesc = decltype(c_m_n_grid_desc); - - using BGridStepHacks = decltype(make_tuple( - make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), - make_tuple( - Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}))); - - using AGridStepHacks = - decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); - - using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}))); - - using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; - using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>; - - using GridwiseGemm = - GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3; - - auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); - - auto c_blockid_to_m0_n0_block_cluster_adaptor = - GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc); - - if(hipThreadIdx_x == 0) - { - *static_cast*>(p_a_k0_m_k1_grid_desc) = - a_k0_m_k1_grid_desc; - *static_cast*>(p_b_k0_n_k1_grid_desc) = - b_k0_n_k1_grid_desc; - *static_cast(p_c_m0_m1_m2_n_grid_desc) = - c_m0_m1_m2_n_grid_desc; - *static_cast( - p_c_blockid_to_m0_n0_block_cluster_adaptor) = c_blockid_to_m0_n0_block_cluster_adaptor; - } -}; - -extern "C" __global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k0_m_k1_grid_desc, - const void CONSTANT* p_b_k0_n_k1_grid_desc, - const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) -{ - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - constexpr auto in_n_hi_wi_c_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256)); - constexpr auto wei_k_y_x_c_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 3, 3, 256)); - constexpr auto out_n_ho_wo_k_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256)); - - constexpr auto descs = - transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(in_n_hi_wi_c_desc, - wei_k_y_x_c_desc, - out_n_ho_wo_k_desc, - make_tuple(1, 1), - make_tuple(1, 1), - make_tuple(1, 1), - make_tuple(1, 1), - Number{}); - - constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0]; - constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1]; - constexpr auto c_m_n_grid_desc = descs[I2]; - - using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp); - using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp); - using CMNGridDesc = decltype(c_m_n_grid_desc); - - using BGridStepHacks = decltype(make_tuple( - make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}), - make_tuple( - Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}))); - - using AGridStepHacks = - decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}))); - - using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 1, 0, 0>{}), - make_tuple(Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 0, 0, 0>{}, - Sequence<0, 0, 2, 0, 0>{}))); - - using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>; - using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>; - - using GridwiseGemm = - GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3; - constexpr auto c_m0_m1_m2_n_grid_desc_tmp = - GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); - constexpr auto c_blockid_to_m0_n0_block_cluster_adaptor_tmp = - GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc); - - using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp); - using CBlockIdToM0N0BlockClusterAdaptor = - decltype(c_blockid_to_m0_n0_block_cluster_adaptor_tmp); - - const auto a_k0_m_k1_grid_desc = - *reinterpret_cast((const void*)p_a_k0_m_k1_grid_desc); - const auto b_k0_n_k1_grid_desc = - *reinterpret_cast((const void*)p_b_k0_n_k1_grid_desc); - const auto c_m0_m1_m2_n_grid_desc = - *reinterpret_cast((const void*)p_c_m0_m1_m2_n_grid_desc); - const auto c_blockid_to_m0_n0_block_cluster_adaptor = - *reinterpret_cast( - (const void*)p_c_blockid_to_m0_n0_block_cluster_adaptor); - - constexpr index_t shared_block_size = - GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - __shared__ FloatAB p_shared_block[shared_block_size]; - - GridwiseGemm::Run(p_a_grid, - p_b_grid, - p_c_grid, - p_shared_block, - a_k0_m_k1_grid_desc, - b_k0_n_k1_grid_desc, - c_m0_m1_m2_n_grid_desc, - c_blockid_to_m0_n0_block_cluster_adaptor); -}; diff --git a/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp deleted file mode 100644 index 71239e0ecc9e732ee1de34ece8953fea510e3584..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp +++ /dev/null @@ -1,400 +0,0 @@ -#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" -#include "gridwise_contraction_dlops_v1r2.hpp" -#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp" - -using namespace ck; - -constexpr DataTypeEnum_t ABDataTypeEnum = static_cast(CK_PARAM_ABDataTypeEnum); -constexpr DataTypeEnum_t AccDataTypeEnum = static_cast(CK_PARAM_AccDataTypeEnum); -constexpr DataTypeEnum_t CDataTypeEnum = static_cast(CK_PARAM_CDataTypeEnum); - -using FloatAB = typename get_datatype_from_enum::type; -using FloatAcc = typename get_datatype_from_enum::type; -using FloatC = typename get_datatype_from_enum::type; - -constexpr index_t BlockSize = CK_PARAM_BlockSize; - -constexpr auto GN0 = Number{}; -constexpr auto GK1 = Number{}; - -constexpr index_t GM1PerBlockGM11 = CK_PARAM_GM1PerBlockGM11; -constexpr index_t GN1PerBlockGN11 = CK_PARAM_GN1PerBlockGN11; -constexpr index_t GK0PerBlock = CK_PARAM_GK0PerBlock; - -constexpr index_t BM1PerThreadBM11 = CK_PARAM_BM1PerThreadBM11; -constexpr index_t BN1PerThreadBN11 = CK_PARAM_BN1PerThreadBN11; -constexpr index_t BK0PerThread = CK_PARAM_BK0PerThread; - -using BM10BN10ThreadClusterBM10Xs = Sequence; -using BM10BN10ThreadClusterBN10Xs = Sequence; - -using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = - Sequence; -using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = - Sequence; -using ABlockTransferThreadClusterArrangeOrder = Sequence<1, 2, 3, 0, 4>; -using ABlockTransferSrcAccessOrder = Sequence<3, 2, 1, 0, 4>; -using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = - Sequence; -using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = - Sequence; -using ABlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>; - -using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = - Sequence; -using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = - Sequence; -using BBlockTransferThreadClusterArrangeOrder = Sequence<0, 4, 1, 2, 3>; -using BBlockTransferSrcAccessOrder = Sequence<4, 3, 2, 0, 1>; -using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = - Sequence; -using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = - Sequence; -using BBlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>; - -using CThreadTransferSrcDstAccessOrder = Sequence<3, 4, 5, 0, 1, 2>; -constexpr index_t CThreadTransferSrcDstVectorDim = 5; -constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector; - -constexpr bool HasMainKBlockLoop = static_cast(CK_PARAM_HasMainKBlockLoop); -constexpr bool HasDoubleTailKBlockLoop = static_cast(CK_PARAM_HasDoubleTailKBlockLoop); - -extern "C" __global__ void -convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(int N_, - int C_, - int Hi_, - int Wi_, - int K_, - int Y_, - int X_, - int ConvStrideH_, - int ConvStrideW_, - int ConvDilationH_, - int ConvDilationW_, - int InLeftPadH_, - int InLeftPadW_, - int InRightPadH_, - int InRightPadW_, - void* p_desc_tuple) -{ - index_t N = static_cast(N_); - index_t C = static_cast(C_); - index_t Hi = static_cast(Hi_); - index_t Wi = static_cast(Wi_); - index_t K = static_cast(K_); - index_t Y = static_cast(Y_); - index_t X = static_cast(X_); - index_t ConvStrideH = static_cast(ConvStrideH_); - index_t ConvStrideW = static_cast(ConvStrideW_); - index_t ConvDilationH = static_cast(ConvDilationH_); - index_t ConvDilationW = static_cast(ConvDilationW_); - index_t InLeftPadH = static_cast(InLeftPadH_); - index_t InLeftPadW = static_cast(InLeftPadW_); - index_t InRightPadH = static_cast(InRightPadH_); - index_t InRightPadW = static_cast(InRightPadW_); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - const index_t Ho = - (Hi + InLeftPadH + InRightPadH - ConvDilationH * (Y - 1) - 1) / ConvStrideH + 1; - const index_t Wo = - (Wi + InLeftPadW + InRightPadW - ConvDilationW * (X - 1) - 1) / ConvStrideW + 1; - - const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C, Hi, Wi)); - const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C, Y, X)); - const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo)); - - const auto descs = transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad( - wei_k_c_y_x_desc, - in_n_c_hi_wi_desc, - out_n_k_ho_wo_desc, - make_tuple(ConvStrideH, ConvStrideW), - make_tuple(ConvDilationH, ConvDilationW), - make_tuple(InLeftPadH, InLeftPadW), - make_tuple(InRightPadH, InRightPadW), - GN0, - GK1); - - const auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0]; - const auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1]; - const auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2]; - - using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1); - using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1); - using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1); - - using AGridStepHacks = - decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11 - Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1 - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11 - Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1 - - using BGridStepHacks = decltype(make_tuple( - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1 - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1 - - using CGridStepHacks = decltype(make_tuple( - make_tuple( - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1 - make_tuple( - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1 - - using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0>; - - using BGridMoveSliceWindowStepHacks = - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>; - - using GridwiseContraction = - GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< - BlockSize, - FloatAB, - FloatAcc, - FloatC, - InMemoryDataOperationEnum_t::Set, - AGridDesc_GK0_GM0_GM1_GK1, - BGridDesc_GK0_GN0_GN1_GK1, - CGridDesc_GM0_GM1_GN0_GN1, - GM1PerBlockGM11, - GN1PerBlockGN11, - GK0PerBlock, - BM1PerThreadBM11, - BN1PerThreadBN11, - BK0PerThread, - BM10BN10ThreadClusterBM10Xs, - BM10BN10ThreadClusterBN10Xs, - ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1, - ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1, - ABlockTransferThreadClusterArrangeOrder, - ABlockTransferSrcAccessOrder, - ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, - ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, - ABlockTransferSrcVectorTensorContiguousDimOrder, - BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1, - BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1, - BBlockTransferThreadClusterArrangeOrder, - BBlockTransferSrcAccessOrder, - BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, - BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, - BBlockTransferSrcVectorTensorContiguousDimOrder, - CThreadTransferSrcDstAccessOrder, - CThreadTransferSrcDstVectorDim, - CThreadTransferDstScalarPerVector, - AGridStepHacks, - BGridStepHacks, - CGridStepHacks, - AGridMoveSliceWindowStepHacks, - BGridMoveSliceWindowStepHacks>; - - if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0) - { - auto desc_tuple = - make_tuple(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1( - a_grid_desc_gk0_gm0_gm1_gk1), - GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1( - b_grid_desc_gk0_gn0_gn1_gk1), - GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1( - c_grid_desc_gm0_gm1_gn0_gn1), - GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10( - c_grid_desc_gm0_gm1_gn0_gn1)); - - *static_cast(p_desc_tuple) = desc_tuple; - } -}; - -extern "C" __global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_desc_tuple) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_n_c_hi_wi_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); - constexpr auto wei_k_c_y_x_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3)); - constexpr auto out_n_k_ho_wo_desc = - make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28)); - - constexpr auto descs = - transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc, - in_n_c_hi_wi_desc, - out_n_k_ho_wo_desc, - make_tuple(1, 1), - make_tuple(1, 1), - make_tuple(1, 1), - make_tuple(1, 1), - GN0, - GK1); - - constexpr auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0]; - constexpr auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1]; - constexpr auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2]; - - using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1); - using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1); - using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1); - - using AGridStepHacks = - decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11 - Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1 - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10 - Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11 - Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1 - - using BGridStepHacks = decltype(make_tuple( - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1 - make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1 - - using CGridStepHacks = decltype(make_tuple( - make_tuple( - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1 - make_tuple( - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0 - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1 - - using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0>; - - using BGridMoveSliceWindowStepHacks = - Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>; - - using GridwiseContraction = - GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1< - BlockSize, - FloatAB, - FloatAcc, - FloatC, - InMemoryDataOperationEnum_t::Set, - AGridDesc_GK0_GM0_GM1_GK1, - BGridDesc_GK0_GN0_GN1_GK1, - CGridDesc_GM0_GM1_GN0_GN1, - GM1PerBlockGM11, - GN1PerBlockGN11, - GK0PerBlock, - BM1PerThreadBM11, - BN1PerThreadBN11, - BK0PerThread, - BM10BN10ThreadClusterBM10Xs, - BM10BN10ThreadClusterBN10Xs, - ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1, - ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1, - ABlockTransferThreadClusterArrangeOrder, - ABlockTransferSrcAccessOrder, - ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, - ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, - ABlockTransferSrcVectorTensorContiguousDimOrder, - BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1, - BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1, - BBlockTransferThreadClusterArrangeOrder, - BBlockTransferSrcAccessOrder, - BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, - BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, - BBlockTransferSrcVectorTensorContiguousDimOrder, - CThreadTransferSrcDstAccessOrder, - CThreadTransferSrcDstVectorDim, - CThreadTransferDstScalarPerVector, - AGridStepHacks, - BGridStepHacks, - CGridStepHacks, - AGridMoveSliceWindowStepHacks, - BGridMoveSliceWindowStepHacks>; - - using AGridDesc_GK0_GM0_GM10_GM11_GK1 = - decltype(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1( - a_grid_desc_gk0_gm0_gm1_gk1)); - using BGridDesc_GK0_GN0_GN10_GN11_GK1 = - decltype(GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1( - b_grid_desc_gk0_gn0_gn1_gk1)); - using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 = - decltype(GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1( - c_grid_desc_gm0_gm1_gn0_gn1)); - using CGridBlockCluster_BlockId_To_GM10_GN10 = - decltype(GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10( - c_grid_desc_gm0_gm1_gn0_gn1)); - - using DescTuple = decltype(make_tuple(AGridDesc_GK0_GM0_GM10_GM11_GK1{}, - BGridDesc_GK0_GN0_GN10_GN11_GK1{}, - CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1{}, - CGridBlockCluster_BlockId_To_GM10_GN10{})); - - const auto desc_tuple = - *reinterpret_cast(cast_pointer_to_generic_address_space(p_desc_tuple)); - - const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = desc_tuple[I0]; - const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = desc_tuple[I1]; - const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = desc_tuple[I2]; - const auto c_grid_block_cluster_blockid_to_gm10_gn10 = desc_tuple[I3]; - - constexpr index_t shared_block_size = - GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - __shared__ FloatAB p_shared_block[shared_block_size]; - - GridwiseContraction::Run(p_a_grid, - p_b_grid, - p_c_grid, - p_shared_block, - a_grid_desc_gk0_gm0_gm10_gm11_gk1, - b_grid_desc_gk0_gn0_gn10_gn11_gk1, - c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, - c_grid_block_cluster_blockid_to_gm10_gn10, - integral_constant{}, - integral_constant{}); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp deleted file mode 100644 index ca6b415910e4c113d74c5a6f625fabd83a818dd8..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp +++ /dev/null @@ -1,271 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_blockwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - void* __restrict__ ws_global) -{ - (void)GridSize; - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto one_dim_srcDesc = transform_tensor_descriptor( - srcDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - auto src2dDesc = transform_tensor_descriptor( - one_dim_srcDesc, - make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - constexpr int invariantLen = 1; - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - - if constexpr(src2d_need_padding) - { - const auto srcPad = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); - - static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_src2dDesc = - transform_tensor_descriptor(ref_one_dim_srcDesc, - make_tuple(make_unmerge_transform( - make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp deleted file mode 100644 index a3daeaf163925b9b9de04805d3a3cddf0dac4dff..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,305 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_blockwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)GridSize; - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); - - auto src2dDesc = - transform_tensor_descriptor(srcDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(toReduceDimLengths)), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - - if constexpr(src2d_need_padding) - { - const auto srcPad = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_toReduceDimLengths = - typename uniform_sequence_gen::type{}; - static constexpr auto ref_invariantDimLengths = - typename uniform_sequence_gen::type{}; - - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); - - static constexpr auto ref_src2dDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), - make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); -}; - -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp deleted file mode 100644 index 81899dfb021fa31a9562731fff14fc39d72f4b5f..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp +++ /dev/null @@ -1,276 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_multiblock.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - void* __restrict__ ws_global) -{ - (void)GridSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto one_dim_srcDesc = transform_tensor_descriptor( - srcDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - auto src2dDesc = transform_tensor_descriptor( - one_dim_srcDesc, - make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - constexpr int invariantLen = 1; - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - const index_t reduceSizePerBlock = - (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) * - copySliceLen; - - if constexpr(src2d_need_padding) - { - const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); - - static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_src2dDesc = - transform_tensor_descriptor(ref_one_dim_srcDesc, - make_tuple(make_unmerge_transform( - make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_dst_global; - (void)indices_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - BlkGroupSize, - alpha, - static_cast(p_src_global), - beta, - static_cast(ws_buf1_global), - static_cast(ws_buf2_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp deleted file mode 100644 index 0e578f4d1d8e5b9120e01eaf67c36442dc01af38..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp +++ /dev/null @@ -1,310 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_multiblock.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)GridSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); - - auto src2dDesc = - transform_tensor_descriptor(srcDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(toReduceDimLengths)), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - const index_t reduceSizePerBlock = - (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) * - copySliceLen; - - if constexpr(src2d_need_padding) - { - const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_toReduceDimLengths = - typename uniform_sequence_gen::type{}; - static constexpr auto ref_invariantDimLengths = - typename uniform_sequence_gen::type{}; - - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); - - static constexpr auto ref_src2dDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), - make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); -}; - -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_dst_global; - (void)indices_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - BlkGroupSize, - alpha, - static_cast(p_src_global), - beta, - static_cast(ws_buf1_global), - static_cast(ws_buf2_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp deleted file mode 100644 index e63a1254e4d872e2073eb0e61d091c1b43b3fe54..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp +++ /dev/null @@ -1,284 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto one_dim_srcDesc = transform_tensor_descriptor( - srcDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - auto src2dDesc = transform_tensor_descriptor( - one_dim_srcDesc, - make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - constexpr int invariantLen = 1; - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = GredThreadBufferLength; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dstdDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); - - static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_src2dDesc = - transform_tensor_descriptor(ref_one_dim_srcDesc, - make_tuple(make_unmerge_transform( - make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp deleted file mode 100644 index 698f740058fa166fa9cf3d38a2efb339efbe2a16..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,318 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); - - auto src2dDesc = - transform_tensor_descriptor(srcDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(toReduceDimLengths)), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = GredThreadBufferLength; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dst1dDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_toReduceDimLengths = - typename uniform_sequence_gen::type{}; - static constexpr auto ref_invariantDimLengths = - typename uniform_sequence_gen::type{}; - - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); - - static constexpr auto ref_src2dDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), - make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); -}; - -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp deleted file mode 100644 index 4a607372e951c3d31250379a9a00ac0933011a53..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_all_dims.cpp +++ /dev/null @@ -1,285 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto one_dim_srcDesc = transform_tensor_descriptor( - srcDesc, - make_tuple(make_merge_transform(tupleSrcLengths)), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - auto src2dDesc = transform_tensor_descriptor( - one_dim_srcDesc, - make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - constexpr int invariantLen = 1; - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dstDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1)); - - static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))), - make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_src2dDesc = - transform_tensor_descriptor(ref_one_dim_srcDesc, - make_tuple(make_unmerge_transform( - make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = - GridwiseReduction_xy_to_x_direct_warpwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp deleted file mode 100644 index a64152790069f1711384b81fa126fa20cfd2e0a6..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_warpwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,320 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t srcDims = CK_PARAM_IN_DIMS; -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS; -constexpr index_t num_invariantDims = srcDims - num_toReduceDims; - -using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type; -using toReduceDims = typename arithmetic_sequence_gen::type; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!"); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize, - int BlkGroupSize, - int inLength0, - int inLength1, - int inLength2, - int inLength3, - int inLength4, - int inLength5, - int inStride0, - int inStride1, - int inStride2, - int inStride3, - int inStride4, - int inStride5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5}; - const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number{}); - const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{}); - const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(srcLengths, invariantDims{}); - - auto src2dDesc = - transform_tensor_descriptor(srcDesc, - make_tuple(make_merge_transform(invariantDimLengths), - make_merge_transform(toReduceDimLengths)), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const auto invariantLen = src2dDesc.GetLength(Number<0>{}); - const auto toReduceLen = src2dDesc.GetLength(Number<1>{}); - - constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dst1dDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_toReduceDimLengths = - typename uniform_sequence_gen::type{}; - static constexpr auto ref_invariantDimLengths = - typename uniform_sequence_gen::type{}; - - static constexpr auto ref_srcLengths = typename uniform_sequence_gen::type{}; - static constexpr auto ref_dstLengths = typename uniform_sequence_gen::type{}; - - // don't have to use accurate strides to get an expected referrence type - static constexpr auto ref_srcDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths)); - static constexpr auto ref_dstDesc = make_naive_tensor_descriptor( - make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths)); - - static constexpr auto ref_src2dDesc = transform_tensor_descriptor( - ref_srcDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)), - make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))), - make_tuple(invariantDims{}, toReduceDims{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{}); - static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{}); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); -}; - -using refType_src2dDesc = - typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = - typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types:: - refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = - typename get_ref_desc_types:: - refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen, - int BlkGroupSize, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)BlkGroupSize; - (void)ws_buf2_bytes_offset; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = - GridwiseReduction_xy_to_x_direct_warpwise; - - constexpr int RunId = need_indices ? 2 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(p_src_global), - beta, - static_cast(p_dst_global), - static_cast(nullptr), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp deleted file mode 100644 index 7e9d46612ef4757386880f9b8be507addad85527..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_all_dims.cpp +++ /dev/null @@ -1,205 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_blockwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -extern "C" __global__ void -gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global) -{ - (void)GridSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const index_t invariantLen = dstDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - - if constexpr(src2d_need_padding) - { - const auto srcPad = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; -}; - -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = make_tuple(8); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); - - // used by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp deleted file mode 100644 index 3f37d01e21ee8eae6afa554930813bf267fa654a..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_blockwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_blockwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, - int BlkGroupSize, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)GridSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const index_t invariantLen = dst1dDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock; - - if constexpr(src2d_need_padding) - { - const auto srcPad = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pass_through_transform(invariantLen), - make_pad_transform(toReduceLen, 0, srcPad)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = - make_tuple_from_seq(typename uniform_sequence_gen::type{}); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(ref_tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); - - // used by the BlockWise and MultiBlock method - using refType_src2dDesc_padded_34 = decltype( - transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pass_through_transform(ref_invariantLen), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_34 = - typename get_ref_desc_types::refType_src2dDesc_padded_34; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp deleted file mode 100644 index 77841d1312bfbf1b4e93a61cf11d1714585953e1..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_all_dims.cpp +++ /dev/null @@ -1,222 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -using toReduceDims = Sequence; -using invariantDims = Sequence; // this could be empty - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable - -extern "C" __global__ void -gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const index_t invariantLen = dstDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen = GredThreadBufferLength; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dstDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; - } -}; - -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = make_tuple(8); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp deleted file mode 100644 index 2de461ad0fafb61075e9205325d02143d0c05d09..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_threadwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,277 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_threadwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, - int BlkGroupSize, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const index_t invariantLen = dst1dDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen = GredThreadBufferLength; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dst1dDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = - make_tuple_from_seq(typename uniform_sequence_gen::type{}); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(ref_tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp deleted file mode 100644 index 1ba5e4965790c5b3cbb06009a295a683de0c33a4..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_all_dims.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable - -extern "C" __global__ void -gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const auto tupleDstLengths = make_tuple(1); - const auto tupleDstStrides = make_tuple(1); - - auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - const index_t invariantLen = dstDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dstDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dstDesc; - } -}; - -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = make_tuple(8); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dstDesc); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dstDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = - GridwiseReduction_xy_to_x_direct_warpwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp b/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp deleted file mode 100644 index aef1545f1189a5a486b31b8306bdda59649eea41..0000000000000000000000000000000000000000 --- a/composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_second_call_warpwise_reduce_partial_dims.cpp +++ /dev/null @@ -1,279 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include "config.hpp" -#include "number.hpp" -#include "sequence.hpp" -#include "tensor_descriptor_helper.hpp" -#include "data_type_enum_helper.hpp" -#include "reduction_common.hpp" -#include "gridwise_generic_2d_reduction_direct_warpwise.hpp" - -using namespace ck; - -using srcDataType = - typename get_datatype_from_enum(CK_PARAM_SRC_DATATYPE)>::type; -using dstDataType = - typename get_datatype_from_enum(CK_PARAM_DST_DATATYPE)>::type; -using compType = - typename get_datatype_from_enum(CK_PARAM_REDUCE_COMPTYPE)>::type; - -constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable - -constexpr index_t dstDims = CK_PARAM_OUT_DIMS; - -constexpr ReduceTensorOp_t op = static_cast(CK_PARAM_REDUCE_OP); -constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0 - ? NanPropagation_t::NOT_PROPAGATE_NAN - : NanPropagation_t::PROPAGATE_NAN; -constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0 - ? ReduceTensorIndices_t::NO_INDICES - : ReduceTensorIndices_t::FLATTENED_INDICES; - -constexpr bool src2d_need_padding = static_cast(CK_PARAM_SRC2D_PADDING); -constexpr bool dst1d_need_padding = static_cast(CK_PARAM_DST1D_PADDING); - -constexpr bool indexable = reduce_binary_operator::indexable; -constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES); - -constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable - -// helper functions using variadic template arguments -template -__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence) -{ - return make_tuple(static_cast(lengths[Ns])...); -}; - -template -__device__ static auto make_tuple_from_array(const int* lengths, Number) -{ - static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); - - constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; - - return make_tuple_from_array_and_index_seq(lengths, index_seq); -}; - -template -__device__ static constexpr auto make_tuple_from_seq(Sequence) -{ - return make_tuple(Ns...); -}; - -extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize, - int BlkGroupSize, - int outLength0, - int outLength1, - int outLength2, - int outLength3, - int outLength4, - int outLength5, - int outStride0, - int outStride1, - int outStride2, - int outStride3, - int outStride4, - int outStride5, - void* __restrict__ ws_global) -{ - (void)BlkGroupSize; - - void* p_src2dDesc = ws_global; - void* p_dst1dDesc = static_cast(ws_global) + 2048; - - const int dstLengths[6] = { - outLength0, outLength1, outLength2, outLength3, outLength4, outLength5}; - const int dstStrides[6] = { - outStride0, outStride1, outStride2, outStride3, outStride4, outStride5}; - - const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number{}); - - const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); - - auto dst1dDesc = transform_tensor_descriptor( - dstDesc, - make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - const index_t invariantLen = dst1dDesc.GetLength(Number<0>{}); - const index_t toReduceLen = BlkGroupSize; - - auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen)); - - constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp; - - if constexpr(src2d_need_padding) - { - const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen; - const auto srcPad2 = - ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen; - - auto src2dDesc_2 = - transform_tensor_descriptor(src2dDesc, - make_tuple(make_pad_transform(invariantLen, 0, srcPad1), - make_pad_transform(toReduceLen, 0, srcPad2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_src2dDesc) = src2dDesc; - } - - if constexpr(dst1d_need_padding) - { - const auto dstPad = GridSize * BlockSize / warpSize - invariantLen; - auto dst1dDesc_2 = - transform_tensor_descriptor(dst1dDesc, - make_tuple(make_pad_transform(invariantLen, 0, dstPad)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc_2; - } - else - { - if(get_thread_local_1d_id() == 0) - *static_cast(p_dst1dDesc) = dst1dDesc; - } -}; - -template -struct get_ref_desc_types -{ - static constexpr auto ref_tupleDstLengths = - make_tuple_from_seq(typename uniform_sequence_gen::type{}); - static constexpr auto ref_dstDesc = - make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths); - - static constexpr auto ref_dst1dDesc = transform_tensor_descriptor( - ref_dstDesc, - make_tuple(make_merge_transform(ref_tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}), - make_tuple(Sequence<0>{})); - - static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{}); - static constexpr index_t ref_toReduceLen = 8; - - static constexpr auto ref_src2dDesc = - make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen)); - - using refType_src2dDesc = decltype(ref_src2dDesc); - using refType_dst1dDesc = decltype(ref_dst1dDesc); - - // used by the DirectThreadWise and DirectWarpWise method - using refType_src2dDesc_padded_12 = - decltype(transform_tensor_descriptor(ref_src2dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2), - make_pad_transform(ref_toReduceLen, 0, 2)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}))); - - using refType_dst1dDesc_padded = - decltype(transform_tensor_descriptor(ref_dst1dDesc, - make_tuple(make_pad_transform(ref_invariantLen, 0, 2)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{}))); -}; - -using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc; -using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc; -using refType_src2dDesc_padded_12 = - typename get_ref_desc_types::refType_src2dDesc_padded_12; -using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded; - -template -static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_src2dDesc)); - else - return (*reinterpret_cast(p_src2dDesc)); -}; - -template -static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc) -{ - if constexpr(need_padding) - return (*reinterpret_cast(p_dst1dDesc)); - else - return (*reinterpret_cast(p_dst1dDesc)); -}; - -extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen, - float alpha, - const void* __restrict__ p_src_global, - float beta, - void* __restrict__ p_dst_global, - const void CONSTANT* ws_global, - long ws_buf2_bytes_offset, - void* __restrict__ indices_global) -{ - (void)p_src_global; - - const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global); - const void* p_dst1dDesc = static_cast(p_src2dDesc) + 2048; - void* ws_buf1_global = const_cast(static_cast(p_src2dDesc) + 4096); - - const auto src2dDesc = get_reduction_src2d_descriptor(p_src2dDesc); - const auto dst1dDesc = get_reduction_dst1d_descriptor(p_dst1dDesc); - - using gridwise_2d_reduce = - GridwiseReduction_xy_to_x_direct_warpwise; - - void* const ws_buf2_global = - ws_buf2_bytes_offset > 0 - ? static_cast(static_cast(ws_buf1_global) + ws_buf2_bytes_offset) - : nullptr; - - constexpr int RunId = need_indices ? 3 : 1; - gridwise_2d_reduce::template Run( - src2dDesc, - dst1dDesc, - origReduceLen, - alpha, - static_cast(ws_buf1_global), - beta, - static_cast(p_dst_global), - static_cast(ws_buf2_global), - static_cast(indices_global)); -}; diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e7b9f01e1cae3901bd0f1661f39bc383300ebd6 --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,3 @@ +ROCmSoftwarePlatform/rocm-recipes +RadeonOpenCompute/rocm-cmake@04f694df2a8dc9d7e35fa4dee4ba5fa407ec04f8 --build +danmar/cppcheck@2.9 \ No newline at end of file diff --git a/doc/image/ck_component.png b/doc/image/ck_component.png new file mode 100644 index 0000000000000000000000000000000000000000..db892331d77273208f861eb68f1df46d0f2667f4 Binary files /dev/null and b/doc/image/ck_component.png differ diff --git a/doc/image/ck_layer.png b/doc/image/ck_layer.png new file mode 100644 index 0000000000000000000000000000000000000000..117a1b3a0ef890a0a2db9bf23f14f03278d1d965 Binary files /dev/null and b/doc/image/ck_layer.png differ diff --git a/doc/markdown/dockerhub.md b/doc/markdown/dockerhub.md new file mode 100644 index 0000000000000000000000000000000000000000..91b6cb2295cf4a8dd08ead2eb165119c74996084 --- /dev/null +++ b/doc/markdown/dockerhub.md @@ -0,0 +1,93 @@ +## CK docker hub + +[Docker hub](https://hub.docker.com/r/rocm/composable_kernel) + +## Why do I need this? + +To make our lives easier and bring Composable Kernel dependencies together, we recommend using docker images. + +## So what is Composable Kernel? + +Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++. + +To get the CK library + +``` +git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git +``` + +run a docker container + +``` +docker run \ +-it \ +--privileged \ +--group-add sudo \ +-w /root/workspace \ +-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ +rocm/composable_kernel:ck_ub20.04_rocm5.3_release \ +/bin/bash +``` + +and build the CK + +``` +mkdir build && cd build + +# Need to specify target ID, example below is for gfx908 and gfx90a +cmake \ +-D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_CXX_FLAGS="-O3" \ +-D CMAKE_BUILD_TYPE=Release \ +-D GPU_TARGETS="gfx908;gfx90a" \ +.. +``` + +and + +``` +make -j examples tests +``` + +To run all the test cases including tests and examples run + +``` +make test +``` + +We can also run specific examples or tests like + +``` +./bin/example_gemm_xdl_fp16 +./bin/test_gemm_fp16 +``` + +For more details visit [CK github repo](https://github.com/ROCmSoftwarePlatform/composable_kernel), [CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example), [even more CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example). + +## And what is inside? + +The docker images have everything you need for running CK including: + +* [ROCm](https://www.amd.com/en/graphics/servers-solutions-rocm) +* [CMake](https://cmake.org/) +* [Compiler](https://github.com/RadeonOpenCompute/llvm-project) + +## Which image is right for me? + +Let's take a look at the image naming, for example "ck_ub20.04_rocm5.4_release". The image specs are: + +* "ck" - made for running Composable Kernel +* "ub20.04" - based on Ubuntu 20.04 +* "rocm5.4" - ROCm platform version 5.4 +* "release" - compiler version is release + +So just pick the right image for your project dependencies and you're all set. + +## DIY starts here + +If you need to customize a docker image or just can't stop tinkering, feel free to adjust the [Dockerfile](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile) for your needs. + +## License + +CK is released under the MIT [license](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/LICENSE). diff --git a/doc/markdown/tutorial_hello_world.md b/doc/markdown/tutorial_hello_world.md new file mode 100644 index 0000000000000000000000000000000000000000..297df10b5c65eaf2058c4fa436a4008c42cbf440 --- /dev/null +++ b/doc/markdown/tutorial_hello_world.md @@ -0,0 +1,191 @@ +## CK Hello world + +## Motivation + +This tutorial is aimed at engineers dealing with artificial intelligence and machine learning who would like to optimize their pipelines and squeeze every performance drop by adding Composable Kernel (CK) library to their projects. We would like to make the CK library approachable so the tutorial is not based on the latest release and doesn't have all the bleeding edge features, but it will be reproducible now and forever. + +During this tutorial we will have an introduction to the CK library, we will build it and run some examples and tests, so to say we will run a "Hello world" example. In future tutorials we will go in depth and breadth and get familiar with other tools and ways to integrate CK into your project. + +## Description + +Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and efficient workflows is still challenging. CK is one of the tools to make AI heavy lifting as fast and efficient as possible. CK is a collection of optimized AI operator kernels and tools to create new ones. The library has components required for majority of modern neural networks architectures including matrix multiplication, convolution, contraction, reduction, attention modules, variety of activation functions, fused operators and many more. + +So how do we (almost) reach the speed of light? CK acceleration abilities are based on: + +* Layered structure. +* Tile-based computation model. +* Tensor coordinate transformation. +* Hardware acceleration use. +* Support of low precision data types including fp16, bf16, int8 and int4. + +If you are excited and need more technical details and benchmarking results - read this awesome blog [post](https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224). + +For more details visit our [github repo](https://github.com/ROCmSoftwarePlatform/composable_kernel). + +## Hardware targets + +CK library fully supports "gfx908" and "gfx90a" GPU architectures and only some operators are supported for "gfx1030". Let's check the hardware you have at hand and decide on the target GPU architecture + +GPU Target AMD GPU +gfx908 Radeon Instinct MI100 +gfx90a Radeon Instinct MI210, MI250, MI250X +gfx1030 Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT + +There are also [cloud options](https://aws.amazon.com/ec2/instance-types/g4/) you can find if you don't have an AMD GPU at hand. + +## Build the library + +First let's clone the library and rebase to the tested version: + +``` +git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git +cd composable_kernel/ +git checkout tutorial_hello_world +``` + +To make our lives easier we prepared [docker images](https://hub.docker.com/r/rocm/composable_kernel) with all the necessary dependencies. Pick the right image and create a container. In this tutorial we use "rocm/composable_kernel:ck_ub20.04_rocm5.3_release" image, it is based on Ubuntu 20.04, ROCm v5.3, compiler release version. + +If your current folder is ${HOME}, start the docker container with + +``` +docker run \ +-it \ +--privileged \ +--group-add sudo \ +-w /root/workspace \ +-v ${HOME}:/root/workspace \ +rocm/composable_kernel:ck_ub20.04_rocm5.3_release \ +/bin/bash +``` + +If your current folder is different from ${HOME}, adjust the line `-v ${HOME}:/root/workspace` to fit your folder structure. + +Inside the docker container current folder is "~/workspace", library path is "~/workspace/composable_kernel", navigate to the library + +``` +cd composable_kernel/ +``` + +Create and go to the "build" directory + +``` +mkdir build && cd build +``` + +In the previous section we talked about target GPU architecture. Once you decide which one is right for you, run cmake using the right GPU_TARGETS flag + +``` +cmake \ +-D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_CXX_FLAGS="-O3" \ +-D CMAKE_BUILD_TYPE=Release \ +-D BUILD_DEV=OFF \ +-D GPU_TARGETS="gfx908;gfx90a;gfx1030" .. +``` + +If everything went well the cmake run will end up with: + +``` +-- Configuring done +-- Generating done +-- Build files have been written to: "/root/workspace/composable_kernel/build" +``` + +Finally, we can build examples and tests + +``` +make -j examples tests +``` + +If everything is smooth, you'll see + +``` +Scanning dependencies of target tests +[100%] Built target tests +``` + +## Run examples and tests + +Examples are listed as test cases as well, so we can run all examples and tests with + +``` +ctest +``` + +You can check the list of all tests by running + +``` +ctest -N +``` + +We can also run them separately, here is a separate example execution. + +``` +./bin/example_gemm_xdl_fp16 1 1 1 +``` + +The arguments "1 1 1" mean that we want to run this example in the mode: verify results with CPU, initialize matrices with integers and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change. + +If everything goes well and you have a device based on gfx908 or gfx90a architecture you should see something like + +``` +a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} +b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} +c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} +launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.10017 ms, 117.117 TFlops, 87.6854 GB/s, DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 +``` + +Meanwhile, running it on a gfx1030 device should result in + +``` +a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} +b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} +c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} +DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem +``` + +But don't panic, some of the operators are supported on gfx1030 architecture, so you can run a separate example like + +``` +./bin/example_gemm_dl_fp16 1 1 1 +``` + +and it should result in something nice similar to + +``` +a_m_k: dim 2, lengths {3840, 4096}, strides {1, 4096} +b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1} +c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} +arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2} +arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2} +arg.c_grid_desc_m_n_{ 3840, 4096} +launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1> +``` + +Or we can run a separate test + +``` +ctest -R test_gemm_fp16 +``` + +If everything goes well you should see something like + +``` +Start 121: test_gemm_fp16 +1/1 Test #121: test_gemm_fp16 ................... Passed 51.81 sec + +100% tests passed, 0 tests failed out of 1 +``` + +## Summary + +In this tutorial we took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. Stay tuned, in the next tutorial we will run kernels with different configs to find out the best one for your hardware and task. + +P.S.: Don't forget to switch out the cloud instance if you have launched one, you can find better ways to spend your money for sure! diff --git a/docs/Doxyfile b/docs/Doxyfile new file mode 100644 index 0000000000000000000000000000000000000000..958b3b6f448a5ca583278d64a3b02d8c3c196f22 --- /dev/null +++ b/docs/Doxyfile @@ -0,0 +1,2453 @@ +# Doxyfile 1.8.10 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "ck" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = v3.0.1.0 + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HiP" + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = ./rocm.jpg + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docBin + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = YES + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = YES + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = ../library/include \ + ../library/include/internal + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf, *.as and *.js. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.as \ + *.js + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# +# +# where is the value of the INPUT_FILTER tag, and is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = ../README.md + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# compiled with the --with-libclang option. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 1 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = YES + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use + S +# (what the is depends on the OS and browser, but it is typically +# , /